LCOV - code coverage report
Current view: top level - gcc - tree-vect-slp.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 92.4 % 5957 5503
Test Date: 2026-02-28 14:20:25 Functions: 95.0 % 180 171
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* SLP - Basic Block Vectorization
       2              :    Copyright (C) 2007-2026 Free Software Foundation, Inc.
       3              :    Contributed by Dorit Naishlos <dorit@il.ibm.com>
       4              :    and Ira Rosen <irar@il.ibm.com>
       5              : 
       6              : This file is part of GCC.
       7              : 
       8              : GCC is free software; you can redistribute it and/or modify it under
       9              : the terms of the GNU General Public License as published by the Free
      10              : Software Foundation; either version 3, or (at your option) any later
      11              : version.
      12              : 
      13              : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      14              : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      15              : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      16              : for more details.
      17              : 
      18              : You should have received a copy of the GNU General Public License
      19              : along with GCC; see the file COPYING3.  If not see
      20              : <http://www.gnu.org/licenses/>.  */
      21              : 
      22              : #include "config.h"
      23              : #define INCLUDE_ALGORITHM
      24              : #include "system.h"
      25              : #include "coretypes.h"
      26              : #include "backend.h"
      27              : #include "target.h"
      28              : #include "rtl.h"
      29              : #include "tree.h"
      30              : #include "gimple.h"
      31              : #include "tree-pass.h"
      32              : #include "ssa.h"
      33              : #include "optabs-tree.h"
      34              : #include "insn-config.h"
      35              : #include "recog.h"            /* FIXME: for insn_data */
      36              : #include "fold-const.h"
      37              : #include "stor-layout.h"
      38              : #include "gimple-iterator.h"
      39              : #include "cfgloop.h"
      40              : #include "tree-vectorizer.h"
      41              : #include "langhooks.h"
      42              : #include "gimple-walk.h"
      43              : #include "dbgcnt.h"
      44              : #include "tree-vector-builder.h"
      45              : #include "vec-perm-indices.h"
      46              : #include "gimple-fold.h"
      47              : #include "internal-fn.h"
      48              : #include "dump-context.h"
      49              : #include "cfganal.h"
      50              : #include "tree-eh.h"
      51              : #include "tree-cfg.h"
      52              : #include "alloc-pool.h"
      53              : #include "sreal.h"
      54              : #include "predict.h"
      55              : 
      56              : #define REDUC_GROUP_FIRST_ELEMENT(S) \
      57              :   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)  /* Checks that S has no dr_aux.dr before reading first_element.  */
      58              : 
      59              : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
      60              :                                             load_permutation_t &,
      61              :                                             const vec<tree> &,
      62              :                                             gimple_stmt_iterator *,
      63              :                                             poly_uint64, bool, bool,
      64              :                                             unsigned *,
      65              :                                             unsigned * = nullptr,
      66              :                                             bool = false);
      67              : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
      68              :                                            slp_tree, lane_permutation_t &,
      69              :                                            vec<slp_tree> &, bool);
      70              : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
      71              : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
      72              : 
      73              : static object_allocator<_slp_tree> *slp_tree_pool;  /* Pool used by _slp_tree::operator new/delete.  */
      74              : static slp_tree slp_first_node;  /* Head of the prev_node/next_node list of all live SLP nodes.  */
      75              : 
      76              : void  /* Create the object pool that SLP nodes are allocated from.  */
      77      1117698 : vect_slp_init (void)
      78              : {
      79      1117698 :   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
      80      1117698 : }
      81              : 
      82              : void  /* Delete every SLP node that is still live, then destroy the pool.  */
      83      1117698 : vect_slp_fini (void)
      84              : {
      85      1673636 :   while (slp_first_node)  /* The destructor unlinks the head, so this advances.  */
      86       555938 :     delete slp_first_node;
      87      2235396 :   delete slp_tree_pool;
      88      1117698 :   slp_tree_pool = NULL;  /* Do not leave a dangling pool pointer behind.  */
      89      1117698 : }
      90              : 
      91              : void *  /* Allocate raw storage for one _slp_tree from slp_tree_pool.  */
      92      7100223 : _slp_tree::operator new (size_t n)
      93              : {
      94      7100223 :   gcc_assert (n == sizeof (_slp_tree));  /* Only exactly _slp_tree-sized requests are accepted.  */
      95      7100223 :   return slp_tree_pool->allocate_raw ();
      96              : }
      97              : 
      98              : void  /* Return the storage of one _slp_tree at NODE to slp_tree_pool.  */
      99      7100223 : _slp_tree::operator delete (void *node, size_t n)
     100              : {
     101      7100223 :   gcc_assert (n == sizeof (_slp_tree));  /* Mirrors the size check in operator new.  */
     102      7100223 :   slp_tree_pool->remove_raw (node);
     103      7100223 : }
     104              : 
     105              : 
     106              : /* Initialize a SLP node: link it in as the new head of the slp_first_node list and reset all fields to their empty defaults.  */
     107              : 
     108      7100223 : _slp_tree::_slp_tree ()
     109              : {
     110      7100223 :   this->prev_node = NULL;  /* The new node becomes the list head.  */
     111      7100223 :   if (slp_first_node)
     112      6178246 :     slp_first_node->prev_node = this;
     113      7100223 :   this->next_node = slp_first_node;
     114      7100223 :   slp_first_node = this;
     115      7100223 :   SLP_TREE_SCALAR_STMTS (this) = vNULL;
     116      7100223 :   SLP_TREE_SCALAR_OPS (this) = vNULL;
     117      7100223 :   SLP_TREE_VEC_DEFS (this) = vNULL;
     118      7100223 :   SLP_TREE_CHILDREN (this) = vNULL;
     119      7100223 :   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
     120      7100223 :   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
     121      7100223 :   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
     122      7100223 :   SLP_TREE_CODE (this) = ERROR_MARK;
     123      7100223 :   SLP_TREE_GS_SCALE (this) = 0;
     124      7100223 :   SLP_TREE_GS_BASE (this) = NULL_TREE;
     125      7100223 :   this->ldst_lanes = false;
     126      7100223 :   this->avoid_stlf_fail = false;
     127      7100223 :   SLP_TREE_VECTYPE (this) = NULL_TREE;
     128      7100223 :   SLP_TREE_REPRESENTATIVE (this) = NULL;
     129      7100223 :   this->cycle_info.id = -1;
     130      7100223 :   this->cycle_info.reduc_idx = -1;
     131      7100223 :   SLP_TREE_REF_COUNT (this) = 1;  /* A new node starts with a single reference.  */
     132      7100223 :   this->failed = NULL;
     133      7100223 :   this->max_nunits = 1;
     134      7100223 :   this->lanes = 0;
     135      7100223 :   SLP_TREE_TYPE (this) = undef_vec_info_type;
     136      7100223 :   this->data = NULL;
     137      7100223 : }
     138              : 
     139              : /* Tear down a SLP node: unlink it from the slp_first_node list and release the vectors, FAILED and DATA that it holds.  */
     140              : 
     141      7100223 : _slp_tree::~_slp_tree ()
     142              : {
     143      7100223 :   if (this->prev_node)
     144      4361657 :     this->prev_node->next_node = this->next_node;
     145              :   else
     146      2738566 :     slp_first_node = this->next_node;  /* This node was the list head.  */
     147      7100223 :   if (this->next_node)
     148      5293040 :     this->next_node->prev_node = this->prev_node;
     149      7100223 :   SLP_TREE_CHILDREN (this).release ();  /* Releases the vector only; child nodes are not freed here.  */
     150      7100223 :   SLP_TREE_SCALAR_STMTS (this).release ();
     151      7100223 :   SLP_TREE_SCALAR_OPS (this).release ();
     152      7100223 :   SLP_TREE_VEC_DEFS (this).release ();
     153      7100223 :   SLP_TREE_LOAD_PERMUTATION (this).release ();
     154      7100223 :   SLP_TREE_LANE_PERMUTATION (this).release ();
     155      7100223 :   if (this->failed)
     156      1921916 :     free (failed);
     157      7100223 :   if (this->data)
     158      1141549 :     delete this->data;
     159      7100223 : }
     160              : 
     161              : /* Push the single SSA definition in DEF to the vector of vector defs: the PHI result if DEF is a PHI, else its single SSA def operand.  */
     162              : 
     163              : void
     164       520393 : _slp_tree::push_vec_def (gimple *def)
     165              : {
     166       520393 :   if (gphi *phi = dyn_cast <gphi *> (def))
     167        58619 :     vec_defs.quick_push (gimple_phi_result (phi));
     168              :   else
     169              :     {
     170       461774 :       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
     171       461774 :       vec_defs.quick_push (get_def_from_ptr (defop));
     172              :     }
     173       520393 : }
     174              : 
     175              : /* Drop one reference to the SLP tree rooted at NODE; when the last reference goes away, recursively do the same for its children and free NODE.  */
     176              : 
     177              : void
     178     13289730 : vect_free_slp_tree (slp_tree node)
     179              : {
     180     13289730 :   int i;
     181     13289730 :   slp_tree child;
     182              : 
     183     13289730 :   if (--SLP_TREE_REF_COUNT (node) != 0)  /* Other references remain, keep NODE alive.  */
     184     13289730 :     return;
     185              : 
     186     10090004 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     187      3545719 :     if (child)  /* Child slots may be NULL.  */
     188      3224309 :       vect_free_slp_tree (child);
     189              : 
     190              :   /* If the node defines any SLP only patterns then those patterns are no
     191              :      longer valid and should be removed.  */
     192      6544285 :   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
     193      6544285 :   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
     194              :     {
     195          973 :       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
     196          973 :       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
     197          973 :       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
     198              :     }
     199              : 
     200      6544285 :   delete node;
     201              : }
     202              : 
     203              : /* Return a location suitable for dumps related to the SLP instance: the first root stmt if there is one, else the first scalar stmt of the root node.  */
     204              : 
     205              : dump_user_location_t
     206      3362144 : _slp_instance::location () const
     207              : {
     208      3362144 :   if (!root_stmts.is_empty ())
     209       313823 :     return root_stmts[0]->stmt;
     210              :   else
     211      3048321 :     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
     212              : }
     213              : 
     214              : 
     215              : /* Free the memory allocated for the SLP instance: drop its reference to the SLP tree, release its vectors and free INSTANCE itself.  */
     216              : 
     217              : void
     218      1448278 : vect_free_slp_instance (slp_instance instance)
     219              : {
     220      1448278 :   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
     221      1448278 :   SLP_INSTANCE_LOADS (instance).release ();
     222      1448278 :   SLP_INSTANCE_ROOT_STMTS (instance).release ();
     223      1448278 :   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
     224      1448278 :   instance->subgraph_entries.release ();
     225      1448278 :   instance->cost_vec.release ();
     226      1448278 :   free (instance);  /* INSTANCE is released with free (), not delete.  */
     227      1448278 : }
     228              : 
     229              : 
     230              : /* Create a new internal-def SLP node with operation CODE, no scalar stmts and space for NOPS children.  */
     231              : 
     232              : slp_tree
     233        86705 : vect_create_new_slp_node (unsigned nops, tree_code code)
     234              : {
     235        86705 :   slp_tree node = new _slp_tree;
     236        86705 :   SLP_TREE_SCALAR_STMTS (node) = vNULL;
     237        86705 :   SLP_TREE_CHILDREN (node).create (nops);
     238        86705 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     239        86705 :   SLP_TREE_CODE (node) = code;
     240        86705 :   return node;
     241              : }
     242              : /* Set up the existing NODE as an internal-def SLP node for SCALAR_STMTS with space for NOPS children, and return NODE.  */
     243              : 
     244              : static slp_tree
     245      3330982 : vect_create_new_slp_node (slp_tree node,
     246              :                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
     247              : {
     248      3330982 :   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
     249      3330982 :   SLP_TREE_CHILDREN (node).create (nops);
     250      3330982 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     251      3330982 :   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];  /* The first stmt represents the node.  */
     252      3330982 :   SLP_TREE_LANES (node) = scalar_stmts.length ();  /* One lane per scalar stmt.  */
     253      3330982 :   return node;
     254              : }
     255              : 
     256              : /* Allocate a new SLP node and set it up for SCALAR_STMTS with space for NOPS children.  */
     257              : 
     258              : static slp_tree
     259         6276 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
     260              : {
     261         6276 :   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
     262              : }
     263              : 
     264              : /* Set up the existing NODE as an external-def SLP node for the scalar operands OPS, and return NODE.  */
     265              : 
     266              : static slp_tree
     267      1750894 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
     268              : {
     269      1750894 :   SLP_TREE_SCALAR_OPS (node) = ops;
     270      1750894 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
     271            0 :   SLP_TREE_LANES (node) = ops.length ();  /* One lane per operand.  */
     272      1750894 :   return node;
     273              : }
     274              : 
     275              : /* Allocate a new SLP node and set it up as an external-def node for OPS.  */
     276              : 
     277              : static slp_tree
     278      1750894 : vect_create_new_slp_node (vec<tree> ops)
     279              : {
     280      1750894 :   return vect_create_new_slp_node (new _slp_tree, ops);
     281              : }
     282              : 
     283              : 
     284              : /* This structure is used in creation of an SLP tree.  Each instance
     285              :    corresponds to the same operand in a group of scalar stmts in an SLP
     286              :    node.  */
     287              : typedef struct _slp_oprnd_info
     288              : {
     289              :   /* Def-stmts for the operands.  */
     290              :   vec<stmt_vec_info> def_stmts;
     291              :   /* Operands.  */
     292              :   vec<tree> ops;
     293              :   /* Information about the first statement, its vector def-type, type, the
     294              :      operand itself in case it's constant, and an indication if it's a pattern
     295              :      stmt and gather/scatter info.  */
     296              :   tree first_op_type;
     297              :   enum vect_def_type first_dt;
     298              :   bool any_pattern;
     299              :   bool first_gs_p;
     300              :   gather_scatter_info first_gs_info;
     301              : } *slp_oprnd_info;  /* Allocated by vect_create_oprnd_info and freed by vect_free_oprnd_info.  */
     302              : 
     303              : 
     304              : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
     305              :    operand.  The result is to be freed with vect_free_oprnd_info.  */
     306              : static vec<slp_oprnd_info>
     307      2974746 : vect_create_oprnd_info (int nops, int group_size)
     308              : {
     309      2974746 :   int i;
     310      2974746 :   slp_oprnd_info oprnd_info;
     311      2974746 :   vec<slp_oprnd_info> oprnds_info;
     312              : 
     313      2974746 :   oprnds_info.create (nops);
     314     10631760 :   for (i = 0; i < nops; i++)
     315              :     {
     316      4682268 :       oprnd_info = XNEW (struct _slp_oprnd_info);
     317      4682268 :       oprnd_info->def_stmts.create (group_size);
     318      4682268 :       oprnd_info->ops.create (group_size);
     319      4682268 :       oprnd_info->first_dt = vect_uninitialized_def;
     320      4682268 :       oprnd_info->first_op_type = NULL_TREE;
     321      4682268 :       oprnd_info->any_pattern = false;
     322      4682268 :       oprnd_info->first_gs_p = false;  /* first_gs_info is not initialized here.  */
     323      4682268 :       oprnds_info.quick_push (oprnd_info);
     324              :     }
     325              : 
     326      2974746 :   return oprnds_info;
     327              : }
     328              : 
     329              : 
     330              : /* Free operands info: release both vectors of each entry, free the entry, then release OPRNDS_INFO itself.  */
     331              : 
     332              : static void
     333      2974746 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
     334              : {
     335      2974746 :   int i;
     336      2974746 :   slp_oprnd_info oprnd_info;
     337              : 
     338      7657014 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     339              :     {
     340      4682268 :       oprnd_info->def_stmts.release ();
     341      4682268 :       oprnd_info->ops.release ();
     342      4682268 :       XDELETE (oprnd_info);  /* Pairs with the XNEW in vect_create_oprnd_info.  */
     343              :     }
     344              : 
     345      2974746 :   oprnds_info.release ();
     346      2974746 : }
     347              : 
     348              : /* Return the execution frequency of NODE (so that a higher value indicates
     349              :    a "more important" node when optimizing for speed).  */
     350              : 
     351              : static sreal
     352      3130747 : vect_slp_node_weight (slp_tree node)
     353              : {
     354      3130747 :   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));  /* Use the original stmt of the representative.  */
     355      3130747 :   basic_block bb = gimple_bb (stmt_info->stmt);
     356      3130747 :   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);  /* Count of BB scaled against the function entry block count.  */
     357              : }
     358              : 
     359              : /* Return true if STMTS contains a pattern statement.  NULL entries in STMTS are skipped.  */
     360              : 
     361              : static bool
     362        22141 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
     363              : {
     364        22141 :   stmt_vec_info stmt_info;
     365        22141 :   unsigned int i;
     366        71725 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
     367        51784 :     if (stmt_info && is_pattern_stmt_p (stmt_info))
     368              :       return true;
     369              :   return false;
     370              : }
     371              : 
     372              : /* Return true when all lanes in the external or constant NODE have
     373              :    the same value.  NODE must be an external or constant def.  */
     374              : 
     375              : static bool
     376       589969 : vect_slp_tree_uniform_p (slp_tree node)
     377              : {
     378       589969 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
     379              :               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
     380              : 
     381              :   /* Pre-existing vectors have no scalar operands and are never uniform.  */
     382      1038884 :   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
     383              :     return false;
     384              : 
     385              :   unsigned i;
     386              :   tree op, first = NULL_TREE;
     387      1349884 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
     388      1208830 :     if (!first)
     389              :       first = op;  /* Remember the first operand to compare the rest against.  */
     390       618861 :     else if (!operand_equal_p (first, op, 0))
     391              :       return false;
     392              : 
     393              :   return true;
     394              : }
     395              : 
     396              : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
     397              :    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
     398              :    of the chain.  The place is the sum of the DR_GROUP_GAPs walked over.  */
     399              : 
     400              : int
     401       652513 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
     402              :                                       stmt_vec_info first_stmt_info)
     403              : {
     404       652513 :   stmt_vec_info next_stmt_info = first_stmt_info;
     405       652513 :   int result = 0;
     406              : 
     407       652513 :   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))  /* STMT_INFO belongs to a different group.  */
     408              :     return -1;
     409              : 
     410      1638960 :   do
     411              :     {
     412      1638960 :       if (next_stmt_info == stmt_info)
     413              :         return result;
     414       986447 :       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
     415       986447 :       if (next_stmt_info)
     416       986447 :         result += DR_GROUP_GAP (next_stmt_info);  /* Advance by the gap of the next element rather than by one.  */
     417              :     }
     418       986447 :   while (next_stmt_info);
     419              : 
     420              :   return -1;
     421              : }
     422              : 
     423              : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
     424              :    using the method implemented by duplicate_and_interleave.  Return true
     425              :    if so, returning the number of intermediate vectors in *NVECTORS_OUT,
     426              :    the type of each intermediate vector in *VECTOR_TYPE_OUT and the two
     427              :    permute masks in PERMUTES[0] and PERMUTES[1] (each only if nonnull).  */
     428              : 
     429              : bool
     430            0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
     431              :                                 tree elt_type, unsigned int *nvectors_out,
     432              :                                 tree *vector_type_out,
     433              :                                 tree *permutes)
     434              : {
     435            0 :   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
     436            0 :   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
     437            0 :     return false;
     438              : 
     439            0 :   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
     440            0 :   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);  /* Start by fusing all COUNT elements into one.  */
     441            0 :   unsigned int nvectors = 1;
     442            0 :   for (;;)
     443              :     {
     444            0 :       scalar_int_mode int_mode;
     445            0 :       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
     446            0 :       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
     447              :         {
     448              :           /* Get the natural vector type for this SLP group size.  */
     449            0 :           tree int_type = build_nonstandard_integer_type
     450            0 :             (GET_MODE_BITSIZE (int_mode), 1);
     451            0 :           tree vector_type
     452            0 :             = get_vectype_for_scalar_type (vinfo, int_type, count);
     453            0 :           poly_int64 half_nelts;
     454            0 :           if (vector_type
     455            0 :               && VECTOR_MODE_P (TYPE_MODE (vector_type))
     456            0 :               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
     457              :                            GET_MODE_SIZE (base_vector_mode))
     458            0 :               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
     459              :                              2, &half_nelts))
     460              :             {
     461              :               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
     462              :                  together into elements of type INT_TYPE and using the result
     463              :                  to build NVECTORS vectors.  */
     464            0 :               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
     465            0 :               vec_perm_builder sel1 (nelts, 2, 3);
     466            0 :               vec_perm_builder sel2 (nelts, 2, 3);
     467              : 
     468            0 :               for (unsigned int i = 0; i < 3; ++i)
     469              :                 {
     470            0 :                   sel1.quick_push (i);  /* SEL1 alternates elements I of the first and second input.  */
     471            0 :                   sel1.quick_push (i + nelts);
     472            0 :                   sel2.quick_push (half_nelts + i);  /* SEL2 does the same starting at HALF_NELTS.  */
     473            0 :                   sel2.quick_push (half_nelts + i + nelts);
     474              :                 }
     475            0 :               vec_perm_indices indices1 (sel1, 2, nelts);
     476            0 :               vec_perm_indices indices2 (sel2, 2, nelts);
     477            0 :               machine_mode vmode = TYPE_MODE (vector_type);
     478            0 :               if (can_vec_perm_const_p (vmode, vmode, indices1)
     479            0 :                   && can_vec_perm_const_p (vmode, vmode, indices2))
     480              :                 {
     481            0 :                   if (nvectors_out)
     482            0 :                     *nvectors_out = nvectors;
     483            0 :                   if (vector_type_out)
     484            0 :                     *vector_type_out = vector_type;
     485            0 :                   if (permutes)
     486              :                     {
     487            0 :                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
     488              :                                                                 indices1);
     489            0 :                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
     490              :                                                                 indices2);
     491              :                     }
     492            0 :                   return true;
     493              :                 }
     494            0 :             }
     495              :         }
     496            0 :       if (!multiple_p (elt_bytes, 2, &elt_bytes))  /* Halve ELT_BYTES for the next attempt, or give up.  */
     497              :         return false;
     498            0 :       nvectors *= 2;
     499              :       /* We need to be able to fuse COUNT / NVECTORS elements together.  */
     500            0 :       if (!multiple_p (count, nvectors))
     501              :         return false;
     502              :     }
     503              : }
     504              : 
     505              : /* Return true if DTA and DTB match, i.e. they are equal or each of them is either an external or a constant def.  */
     506              : 
     507              : static bool
     508     16777999 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
     509              : {
     510     16777999 :   return (dta == dtb
     511       342132 :           || ((dta == vect_external_def || dta == vect_constant_def)
     512       214103 :               && (dtb == vect_external_def || dtb == vect_constant_def)));
     513              : }
     514              : 
     515              : #define GATHER_SCATTER_OFFSET (-3)  /* Pseudo argument index standing for the gather/scatter offset.  */
     516              : 
     517              : static const int no_arg_map[] = { 0 };  /* Each map: the child count, then one argument index per child.  */
     518              : static const int arg0_map[] = { 1, 0 };
     519              : static const int arg2_map[] = { 1, 2 };
     520              : static const int arg2_arg3_map[] = { 2, 2, 3 };
     521              : static const int arg2_arg4_map[] = { 2, 2, 4 };
     522              : static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
     523              : static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
     524              : static const int arg3_arg2_map[] = { 2, 3, 2 };
     525              : static const int op1_op0_map[] = { 2, 1, 0 };
     526              : static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
     527              : static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
     528              : static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
     529              : static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
     530              : static const int mask_call_maps[6][7] = {  /* Row K maps arguments 1 .. K + 1 to K + 1 children.  */
     531              :   { 1, 1, },
     532              :   { 2, 1, 2, },
     533              :   { 3, 1, 2, 3, },
     534              :   { 4, 1, 2, 3, 4, },
     535              :   { 5, 1, 2, 3, 4, 5, },
     536              :   { 6, 1, 2, 3, 4, 5, 6 },
     537              : };
     538              : 
     539              : /* For most SLP statements, there is a one-to-one mapping between
     540              :    gimple arguments and child nodes.  If that is not true for STMT,
     541              :    return an array that contains:
     542              : 
     543              :    - the number of child nodes, followed by
     544              :    - for each child node, the index of the argument associated with that node.
     545              :      The special index -1 is the first operand of an embedded comparison and
     546              :      the special index -2 is the second operand of an embedded comparison.
     547              :      The special index -3 (GATHER_SCATTER_OFFSET) is the offset of a gather as analyzed by
     548              :      vect_check_gather_scatter.
     549              : 
     550              :    SWAP is as for vect_get_and_check_slp_defs.  Return nullptr when no such array is provided.  */
     551              : 
     552              : static const int *
     553     18783496 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
     554              :                       unsigned char swap = 0)
     555              : {
     556     18783496 :   if (auto assign = dyn_cast<const gassign *> (stmt))
     557              :     {
     558     17643968 :       if (gimple_assign_rhs_code (assign) == COND_EXPR
     559     17643968 :           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
     560            0 :         gcc_unreachable ();  /* COND_EXPRs with an embedded comparison are not expected here.  */
     561     17643968 :       if ((TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
     562     16357246 :            || commutative_tree_code (gimple_assign_rhs_code (assign)))
     563     26169049 :           && swap)
     564              :         return op1_op0_map;  /* Swapped operands: child 0 is operand 1, child 1 is operand 0.  */
     565     17603664 :       if (gather_scatter_p)
     566        42215 :         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
     567        42215 :                 ? off_op0_map : off_map);  /* A non-SSA lhs additionally maps operand 0.  */
     568              :     }
     569     18700977 :   gcc_assert (!swap);  /* SWAP is only handled for the comparison/commutative assigns above.  */
     570     18700977 :   if (auto call = dyn_cast<const gcall *> (stmt))
     571              :     {
     572       139291 :       if (gimple_call_internal_p (call))
     573        73317 :         switch (gimple_call_internal_fn (call))
     574              :           {
     575        12168 :           case IFN_MASK_LOAD:
     576        20116 :             return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
     577              : 
     578            0 :           case IFN_GATHER_LOAD:
     579            0 :             return arg2_map;
     580              : 
     581            0 :           case IFN_MASK_GATHER_LOAD:
     582            0 :           case IFN_MASK_LEN_GATHER_LOAD:
     583            0 :             return arg2_arg5_arg6_map;
     584              : 
     585            0 :           case IFN_SCATTER_STORE:
     586            0 :             return arg2_arg4_map;
     587              : 
     588            0 :           case IFN_MASK_SCATTER_STORE:
     589            0 :           case IFN_MASK_LEN_SCATTER_STORE:
     590            0 :             return arg2_arg4_arg5_map;
     591              : 
     592         6227 :           case IFN_MASK_STORE:
     593        11178 :             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
     594              : 
     595          988 :           case IFN_MASK_CALL:
     596          988 :             {
     597          988 :               unsigned nargs = gimple_call_num_args (call);
     598          988 :               if (nargs >= 2 && nargs <= 7)  /* mask_call_maps has six rows, for 2 .. 7 call arguments.  */
     599          988 :                 return mask_call_maps[nargs-2];
     600              :               else
     601              :                 return nullptr;
     602              :             }
     603              : 
     604          140 :           case IFN_CLZ:
     605          140 :           case IFN_CTZ:
     606          140 :             return arg0_map;  /* Only argument 0 gets a child node.  */
     607              : 
     608         6306 :           case IFN_GOMP_SIMD_LANE:
     609         6306 :             return no_arg_map;  /* No child nodes at all.  */
     610              : 
     611              :           default:
     612              :             break;
     613              :           }
     614              :     }
     615              :   return nullptr;  /* No special map for STMT.  */
     616              : }
     617              : 
     618              : /* Return the SLP node child index for operand OP of STMT.  */
     619              : 
     620              : int
     621      1322360 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
     622              :                                   bool gather_scatter_p)
     623              : {
     624      1322360 :   const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
     625      1322360 :   if (!opmap)
     626              :     return op;  /* Without an operand map the child index is OP itself.  */
     627        18015 :   for (int i = 1; i < 1 + opmap[0]; ++i)  /* opmap[0] is the entry count, the entries follow it.  */
     628        18015 :     if (opmap[i] == op)
     629         9882 :       return i - 1;  /* The child index is the zero-based position of OP in the map.  */
     630            0 :   gcc_unreachable ();  /* OP is expected to be listed in the map.  */
     631              : }
     632              : 
     633              : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
     634              :    they are of a valid type and that they match the defs of the first stmt of
     635              :    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
     636              :    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
     637              :    indicates swap is required for cond_expr stmts.  Specifically, SWAP
     638              :    is 1 if STMT is cond and operands of comparison need to be swapped;
     639              :    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
     640              : 
     641              :    If there was a fatal error return -1; if the error could be corrected by
     642              :    swapping operands of the parent node of this one, return 1; if everything is
     643              :    ok return 0.  Operands for which SKIP_ARGS[i] is set are recorded as NULL.  */
     644              : static int
     645     12201219 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
     646              :                              bool *skip_args,
     647              :                              vec<stmt_vec_info> stmts, unsigned stmt_num,
     648              :                              vec<slp_oprnd_info> *oprnds_info)
     649              : {
     650     12201219 :   stmt_vec_info stmt_info = stmts[stmt_num];
     651     12201219 :   tree oprnd;
     652     12201219 :   unsigned int i, number_of_oprnds;
     653     12201219 :   enum vect_def_type dt = vect_uninitialized_def;
     654     12201219 :   slp_oprnd_info oprnd_info;
     655     12201219 :   gather_scatter_info gs_info;
     656     12201219 :   unsigned int gs_op = -1u;
     657     12201219 :   unsigned int commutative_op = -1U;
     658     12201219 :   bool first = stmt_num == 0;
     659              : 
     660     12201219 :   if (!stmt_info)  /* No stmt at STMT_NUM: record NULL for every operand.  */
     661              :     {
     662            0 :       for (auto oi : *oprnds_info)
     663              :         {
     664            0 :           oi->def_stmts.quick_push (NULL);
     665            0 :           oi->ops.quick_push (NULL_TREE);
     666              :         }
     667              :       return 0;
     668              :     }
     669              : 
     670     12201219 :   if (!is_a<gcall *> (stmt_info->stmt)
     671              :       && !is_a<gassign *> (stmt_info->stmt)
     672              :       && !is_a<gphi *> (stmt_info->stmt))
     673              :     return -1;
     674              : 
     675     12201219 :   number_of_oprnds = gimple_num_args (stmt_info->stmt);
     676     12201219 :   const int *map
     677     24402438 :     = vect_get_operand_map (stmt_info->stmt,
     678     12201219 :                             STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
     679     12201219 :   if (map)
     680        69668 :     number_of_oprnds = *map++;  /* The leading map entry is the operand count.  */
     681     12201219 :   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
     682              :     {
     683        40072 :       if (gimple_call_internal_p (stmt))
     684              :         {
     685        24252 :           internal_fn ifn = gimple_call_internal_fn (stmt);
     686        24252 :           commutative_op = first_commutative_argument (ifn);
     687        24252 :           if (internal_gather_scatter_fn_p (ifn))
     688              :             {
     689            0 :               vect_describe_gather_scatter_call
     690            0 :                 (stmt_info,
     691            0 :                  first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
     692            0 :               if (first)
     693            0 :                 (*oprnds_info)[0]->first_gs_p = true;
     694              :               gs_op = 0;
     695              :             }
     696              :         }
     697              :     }
     698     12161147 :   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
     699              :     {
     700     14218248 :       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
     701      8176202 :         commutative_op = 0;
     702              :     }
     703              : 
     704     12201219 :   bool swapped = (swap != 0);
     705     12201219 :   bool backedge = false;
     706     12201219 :   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
     707     33799340 :   for (i = 0; i < number_of_oprnds; i++)
     708              :     {
     709     21599236 :       oprnd_info = (*oprnds_info)[i];
     710     21599236 :       int opno = map ? map[i] : int (i);
     711     21599236 :       if (opno == GATHER_SCATTER_OFFSET)
     712              :         {
     713        22050 :           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
     714        22050 :           if (!is_a <loop_vec_info> (vinfo)
     715        22050 :               || !vect_check_gather_scatter (stmt_info, vectype,
     716              :                                              as_a <loop_vec_info> (vinfo),
     717              :                                              first ? &oprnd_info->first_gs_info
     718              :                                              : &gs_info))
     719         1115 :             return -1;
     720              : 
     721        22050 :           if (first)
     722              :             {
     723        21813 :               oprnd_info->first_gs_p = true;
     724        21813 :               oprnd = oprnd_info->first_gs_info.offset;
     725              :             }
     726              :           else
     727              :             {
     728          237 :               gs_op = i;
     729          237 :               oprnd = gs_info.offset;
     730              :             }
     731              :         }
     732     21577186 :       else if (opno < 0)  /* Other negative indices select an operand of the tree in argument 0.  */
     733            0 :         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     734              :       else
     735              :         {
     736     21577186 :           oprnd = gimple_arg (stmt_info->stmt, opno);
     737     21577186 :           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
     738              :             {
     739      1088608 :               edge e = gimple_phi_arg_edge (stmt, opno);
     740      2177216 :               backedge = (is_a <bb_vec_info> (vinfo)
     741      1626956 :                           ? e->flags & EDGE_DFS_BACK
     742       538348 :                           : dominated_by_p (CDI_DOMINATORS, e->src,
     743       538348 :                                             gimple_bb (stmt_info->stmt)));
     744              :             }
     745              :         }
     746     21599236 :       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
     747         2650 :         oprnd = TREE_OPERAND (oprnd, 0);
     748              : 
     749     21599236 :       stmt_vec_info def_stmt_info;
     750     21599236 :       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
     751              :         {
     752          963 :           if (dump_enabled_p ())
     753            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     754              :                              "Build SLP failed: can't analyze def for %T\n",
     755              :                              oprnd);
     756              : 
     757          963 :           return -1;
     758              :         }
     759              : 
     760     21598273 :       if (skip_args[i])
     761              :         {
     762       444785 :           oprnd_info->def_stmts.quick_push (NULL);
     763       444785 :           oprnd_info->ops.quick_push (NULL_TREE);
     764       444785 :           oprnd_info->first_dt = vect_uninitialized_def;
     765       444785 :           continue;
     766              :         }
     767              : 
     768     21153488 :       oprnd_info->def_stmts.quick_push (def_stmt_info);
     769     21153488 :       oprnd_info->ops.quick_push (oprnd);
     770              : 
     771     21153488 :       if (def_stmt_info
     772     21153488 :           && is_pattern_stmt_p (def_stmt_info))
     773              :         {
     774       344299 :           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
     775              :               != def_stmt_info)
     776       247311 :             oprnd_info->any_pattern = true;
     777              :           else
     778              :             /* If we promote this to external use the original stmt def.  */
     779        96988 :             oprnd_info->ops.last ()
     780       193976 :               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
     781              :         }
     782              : 
     783              :       /* If there's an extern def on a backedge make sure we can
     784              :          code-generate at the region start.
     785              :          ???  This is another case that could be fixed by adjusting
     786              :          how we split the function but at the moment we'd have conflicting
     787              :          goals there.  */
     788     21153488 :       if (backedge
     789       126982 :           && dts[i] == vect_external_def
     790          173 :           && is_a <bb_vec_info> (vinfo)
     791          173 :           && TREE_CODE (oprnd) == SSA_NAME
     792          152 :           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
     793     21153640 :           && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
     794          152 :                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
     795              :         {
     796          152 :           if (dump_enabled_p ())
     797            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     798              :                              "Build SLP failed: extern def %T only defined "
     799              :                              "on backedge\n", oprnd);
     800          152 :           return -1;
     801              :         }
     802              : 
     803     21153336 :       if (first)
     804              :         {
     805      4261052 :           tree type = TREE_TYPE (oprnd);
     806      4261052 :           dt = dts[i];
     807              : 
     808              :           /* For the swapping logic below force vect_reduction_def
     809              :              for the reduction op in a SLP reduction group.  */
     810      4261052 :           if (!STMT_VINFO_DATA_REF (stmt_info)
     811      3175706 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     812         3288 :               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
     813      4262672 :               && def_stmt_info)
     814         1620 :             dts[i] = dt = vect_reduction_def;
     815              : 
     816              :           /* Check the types of the definition.  */
     817      4261052 :           switch (dt)
     818              :             {
     819      4261052 :             case vect_external_def:
     820      4261052 :             case vect_constant_def:
     821      4261052 :             case vect_internal_def:
     822      4261052 :             case vect_reduction_def:
     823      4261052 :             case vect_double_reduction_def:
     824      4261052 :             case vect_induction_def:
     825      4261052 :             case vect_nested_cycle:
     826      4261052 :             case vect_first_order_recurrence:
     827      4261052 :               break;
     828              : 
     829            0 :             default:
     830              :               /* FORNOW: Not supported.  */
     831            0 :               if (dump_enabled_p ())
     832            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     833              :                                  "Build SLP failed: illegal type of def %T\n",
     834              :                                  oprnd);
     835            0 :               return -1;
     836              :             }
     837              : 
     838      4261052 :           oprnd_info->first_dt = dt;
     839      4261052 :           oprnd_info->first_op_type = type;
     840              :         }
     841              :     }
     842     12200104 :   if (first)
     843              :     return 0;  /* The first stmt only records its defs; matching applies to later stmts.  */
     844              : 
     845              :   /* Now match the operand definition types to that of the first stmt.  */
     846     25852480 :   for (i = 0; i < number_of_oprnds;)  /* I advances at the bottom; a swap retries the same I.  */
     847              :     {
     848     16888230 :       if (skip_args[i])
     849              :         {
     850        27772 :           ++i;
     851        27772 :           continue;
     852              :         }
     853              : 
     854     16860458 :       oprnd_info = (*oprnds_info)[i];
     855     16860458 :       dt = dts[i];
     856     16860458 :       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
     857     16860458 :       oprnd = oprnd_info->ops[stmt_num];
     858     16860458 :       tree type = TREE_TYPE (oprnd);
     859              : 
     860     16860458 :       if (!types_compatible_p (oprnd_info->first_op_type, type))
     861              :         {
     862        88803 :           if (dump_enabled_p ())
     863          107 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     864              :                              "Build SLP failed: different operand types\n");
     865        88803 :           return 1;
     866              :         }
     867              : 
     868     16771655 :       if ((gs_op == i) != oprnd_info->first_gs_p)
     869              :         {
     870            0 :           if (dump_enabled_p ())
     871            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     872              :                              "Build SLP failed: mixed gather and non-gather\n");
     873            0 :           return 1;
     874              :         }
     875     16771655 :       else if (gs_op == i)
     876              :         {
     877          207 :           if (!operand_equal_p (oprnd_info->first_gs_info.base,
     878          207 :                                 gs_info.base))
     879              :             {
     880           16 :               if (dump_enabled_p ())
     881            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     882              :                                  "Build SLP failed: different gather base\n");
     883           16 :               return 1;
     884              :             }
     885          191 :           if (oprnd_info->first_gs_info.scale != gs_info.scale)
     886              :             {
     887            8 :               if (dump_enabled_p ())
     888            2 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     889              :                                  "Build SLP failed: different gather scale\n");
     890            8 :               return 1;
     891              :             }
     892              :         }
     893              : 
     894              :       /* Not first stmt of the group, check that the def-stmt/s match
     895              :          the def-stmt/s of the first stmt.  Allow different definition
     896              :          types for reduction chains: the first stmt must be a
     897              :          vect_reduction_def (a phi node), and the rest
     898              :          end in the reduction chain.  */
     899     16771631 :       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
     900       284804 :            && !(oprnd_info->first_dt == vect_reduction_def
     901         2777 :                 && !STMT_VINFO_DATA_REF (stmt_info)
     902         2777 :                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     903         2767 :                 && def_stmt_info
     904         2767 :                 && !STMT_VINFO_DATA_REF (def_stmt_info)
     905         2767 :                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     906              :                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
     907     16489594 :           || (!STMT_VINFO_DATA_REF (stmt_info)
     908     15217699 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     909         5814 :               && ((!def_stmt_info
     910         5652 :                    || STMT_VINFO_DATA_REF (def_stmt_info)
     911        10379 :                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     912              :                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
     913         5814 :                   != (oprnd_info->first_dt != vect_reduction_def))))
     914              :         {
     915              :           /* Try swapping operands if we got a mismatch.  For BB
     916              :              vectorization only in case it will clearly improve things.  */
     917       283968 :           if (i == commutative_op && !swapped
     918       282037 :               && (!is_a <bb_vec_info> (vinfo)
     919         5114 :                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
     920         5114 :                                              dts[i+1])
     921         1108 :                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
     922              :                           || vect_def_types_match
     923          146 :                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
     924              :             {
     925         1931 :               if (dump_enabled_p ())
     926          144 :                 dump_printf_loc (MSG_NOTE, vect_location,
     927              :                                  "trying swapped operands\n");
     928         1931 :               std::swap (dts[i], dts[i+1]);
     929         1931 :               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
     930         1931 :                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
     931         1931 :               std::swap ((*oprnds_info)[i]->ops[stmt_num],
     932         1931 :                          (*oprnds_info)[i+1]->ops[stmt_num]);
     933              :               /* After swapping some operands we lost track whether an
     934              :                  operand has any pattern defs so be conservative here.  */
     935         1931 :               if ((*oprnds_info)[i]->any_pattern
     936         1931 :                   || (*oprnds_info)[i+1]->any_pattern)
     937            4 :                 (*oprnds_info)[i]->any_pattern
     938            2 :                   = (*oprnds_info)[i+1]->any_pattern = true;
     939         1931 :               swapped = true;
     940         1931 :               continue;
     941              :             }
     942              : 
     943       280106 :           if (is_a <bb_vec_info> (vinfo)
     944       269619 :               && !oprnd_info->any_pattern
     945       549487 :               && number_of_oprnds > 1)
     946              :             {
     947              :               /* Now for commutative ops we should see whether we can
     948              :                  make the other operand matching.  */
     949       103988 :               if (dump_enabled_p ())
     950          149 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     951              :                                  "treating operand as external\n");
     952       103988 :               oprnd_info->first_dt = dt = vect_external_def;
     953              :             }
     954              :           else
     955              :             {
     956       176118 :               if (dump_enabled_p ())
     957          406 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     958              :                                  "Build SLP failed: different types\n");
     959       176118 :               return 1;
     960              :             }
     961              :         }
     962              : 
     963              :       /* Make sure to demote the overall operand to external.  */
     964     16593582 :       if (dt == vect_external_def)
     965       330363 :         oprnd_info->first_dt = vect_external_def;
     966              :       /* For a SLP reduction chain we want to duplicate the reduction to
     967              :          each of the chain members.  That gets us a sane SLP graph (still
     968              :          the stmts are not 100% correct wrt the initial values).  */
     969     16263219 :       else if ((dt == vect_internal_def
     970     16263219 :                 || dt == vect_reduction_def)
     971     15359454 :                && oprnd_info->first_dt == vect_reduction_def
     972        64716 :                && !STMT_VINFO_DATA_REF (stmt_info)
     973        64716 :                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     974         2767 :                && !STMT_VINFO_DATA_REF (def_stmt_info)
     975     16265986 :                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     976              :                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
     977              :         {
     978         2767 :           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
     979         2767 :           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
     980              :         }
     981              : 
     982     16593582 :       ++i;
     983              :     }
     984              : 
     985              :   /* Swap operands.  */
     986      8964250 :   if (swapped)
     987              :     {
     988        39978 :       if (dump_enabled_p ())
     989          432 :         dump_printf_loc (MSG_NOTE, vect_location,
     990              :                          "swapped operands to match def types in %G",
     991              :                          stmt_info->stmt);
     992              :     }
     993              : 
     994              :   return 0;
     995              : }
     996              : 
     997              : /* Return true if call statements CALL1 and CALL2 are similar enough
     998              :    to be combined into the same SLP group.  */
     999              : 
    1000              : bool
    1001        20886 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
    1002              : {
    1003        20886 :   unsigned int nargs = gimple_call_num_args (call1);
    1004        20886 :   if (nargs != gimple_call_num_args (call2))
    1005              :     return false;
    1006              : 
    1007        18950 :   auto cfn1 = gimple_call_combined_fn (call1);
    1008        18950 :   auto cfn2 = gimple_call_combined_fn (call2);
    1009        18950 :   if (cfn1 != cfn2
    1010            2 :       && (!allow_two_operators
    1011            2 :           || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
    1012            2 :                && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
    1013              :     return false;  /* Different functions, and not the FMA/FMS mix permitted by ALLOW_TWO_OPERATORS.  */
    1014              : 
    1015        18950 :   if (gimple_call_internal_p (call1))
    1016              :     {
    1017         7084 :       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
    1018         7084 :                                TREE_TYPE (gimple_call_lhs (call2))))
    1019              :         return false;
    1020        14393 :       for (unsigned int i = 0; i < nargs; ++i)
    1021         7309 :         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
    1022         7309 :                                  TREE_TYPE (gimple_call_arg (call2, i))))
    1023              :           return false;
    1024              :     }
    1025              :   else
    1026              :     {
    1027        11866 :       if (!operand_equal_p (gimple_call_fn (call1),
    1028        11866 :                             gimple_call_fn (call2), 0))
    1029              :         return false;
    1030              : 
    1031        25848 :       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
    1032              :         return false;
    1033              :     }
    1034              : 
    1035              :   /* Check that any unvectorized arguments are equal.  */
    1036        15700 :   if (const int *map = vect_get_operand_map (call1))
    1037              :     {
    1038           15 :       unsigned int nkept = *map++;  /* The leading map entry is the number of mapped arguments.  */
    1039           15 :       unsigned int mapi = 0;
    1040           57 :       for (unsigned int i = 0; i < nargs; ++i)
    1041           42 :         if (mapi < nkept && map[mapi] == int (i))  /* NOTE(review): entries are consumed in order, which looks like it expects ascending argument indices in the map; confirm against the map definitions.  */
    1042           27 :           mapi += 1;
    1043           15 :         else if (!operand_equal_p (gimple_call_arg (call1, i),
    1044           15 :                                    gimple_call_arg (call2, i)))
    1045              :           return false;
    1046              :     }
    1047              : 
    1048              :   return true;
    1049              : }
    1050              : 
    1051              : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
    1052              :    caller's attempt to find the vector type in STMT_INFO with the narrowest
    1053              :    element type.  Return true if VECTYPE is nonnull and if it is valid
    1054              :    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
    1055              :    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
    1056              :    vect_build_slp_tree.  */
    1057              : 
    1058              : static bool
    1059      4954791 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
    1060              :                         unsigned int group_size,
    1061              :                         tree vectype, poly_uint64 *max_nunits)
    1062              : {
    1063      4954791 :   if (!vectype)  /* The caller found no vector type for STMT_INFO.  */
    1064              :     {
    1065         4404 :       if (dump_enabled_p ())
    1066            7 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1067              :                          "Build SLP failed: unsupported data-type in %G\n",
    1068              :                          stmt_info->stmt);
    1069              :       /* Fatal mismatch.  */
    1070         4404 :       return false;
    1071              :     }
    1072              : 
    1073              :   /* If populating the vector type requires unrolling then fail
    1074              :      before adjusting *max_nunits for basic-block vectorization.  */
    1075      4950387 :   if (is_a <bb_vec_info> (vinfo)
    1076      4950387 :       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))  /* GROUP_SIZE is not a multiple of the number of vector subparts.  */
    1077              :     {
    1078       141278 :       if (dump_enabled_p ())
    1079           34 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1080              :                          "Build SLP failed: unrolling required "
    1081              :                          "in basic block SLP\n");
    1082              :       /* Fatal mismatch.  */
    1083       141278 :       return false;
    1084              :     }
    1085              : 
    1086              :   /* In case of multiple types we need to detect the smallest type.  */
    1087      4809109 :   vect_update_max_nunits (max_nunits, vectype);
    1088      4809109 :   return true;
    1089              : }
    1090              : 
    1091              : /* Verify if the scalar stmts STMTS are isomorphic, require data
    1092              :    permutation or are of unsupported types of operation.  Return
    1093              :    true if they are, otherwise return false and indicate in *MATCHES
    1094              :    which stmts are not isomorphic to the first one.  If MATCHES[0]
    1095              :    is false then this indicates the comparison could not be
    1096              :    carried out or the stmts will never be vectorized by SLP.
    1097              : 
    1098              :    Note COND_EXPR is possibly isomorphic to another one after swapping its
    1099              :    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
    1100              :    the first stmt by swapping the two operands of comparison; set SWAP[i]
    1101              :    to 2 if stmt I is isomorphic to the first stmt by inverting the code
    1102              :    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
    1103              :    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
    1104              : 
    1105              : static bool
    1106      5239811 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
    1107              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1108              :                        poly_uint64 *max_nunits, bool *matches,
    1109              :                        bool *two_operators, tree *node_vectype)
    1110              : {
    1111      5239811 :   unsigned int i;
    1112      5239811 :   stmt_vec_info first_stmt_info = stmts[0];
    1113      5239811 :   code_helper first_stmt_code = ERROR_MARK;
    1114      5239811 :   code_helper alt_stmt_code = ERROR_MARK;
    1115      5239811 :   code_helper first_cond_code = ERROR_MARK;
    1116      5239811 :   bool need_same_oprnds = false;
    1117      5239811 :   tree first_lhs = NULL_TREE;
    1118      5239811 :   tree first_op1 = NULL_TREE;
    1119      5239811 :   stmt_vec_info first_load = NULL, prev_first_load = NULL;
    1120      5239811 :   bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
    1121      5239811 :   bool first_stmt_phi_p = false;
    1122      5239811 :   int first_reduc_idx = -1;
    1123      5239811 :   bool maybe_soft_fail = false;
    1124      5239811 :   tree soft_fail_nunits_vectype = NULL_TREE;
    1125              : 
    1126      5239811 :   tree vectype, nunits_vectype;
    1127      5239811 :   if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
    1128              :                                        &nunits_vectype, group_size))
    1129              :     {
    1130              :       /* Fatal mismatch.  */
    1131       193781 :       matches[0] = false;
    1132       193781 :       return false;
    1133              :     }
    1134      5046030 :   if (is_a <bb_vec_info> (vinfo)
    1135      5046030 :       && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    1136              :     {
    1137       343438 :       if (dump_enabled_p ())
    1138          290 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1139              :                          "Build SLP failed: not using single lane "
    1140              :                          "vector type %T\n", vectype);
    1141       343438 :       matches[0] = false;
    1142       343438 :       return false;
    1143              :     }
    1144              :   /* Record nunits required but continue analysis, producing matches[]
    1145              :      as if nunits was not an issue.  This allows splitting of groups
    1146              :      to happen.  */
    1147      4702592 :   if (nunits_vectype
    1148      4702592 :       && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
    1149              :                                   nunits_vectype, max_nunits))
    1150              :     {
    1151       141278 :       gcc_assert (is_a <bb_vec_info> (vinfo));
    1152       141278 :       maybe_soft_fail = true;
    1153       141278 :       soft_fail_nunits_vectype = nunits_vectype;
    1154              :     }
    1155              : 
    1156      4702592 :   gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
    1157      4702592 :   *node_vectype = vectype;
    1158              : 
    1159              :   /* For every stmt in NODE find its def stmt/s.  */
    1160      4702592 :   stmt_vec_info stmt_info;
    1161     20939920 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    1162              :     {
    1163     16396246 :       bool ldst_p = false;
    1164     16396246 :       bool ldst_masklen_p = false;
    1165     16396246 :       bool phi_p = false;
    1166     16396246 :       code_helper rhs_code = ERROR_MARK;
    1167              : 
    1168     16396246 :       swap[i] = 0;
    1169     16396246 :       matches[i] = false;
    1170     16396246 :       if (!stmt_info)
    1171              :         {
    1172        39729 :           matches[i] = true;
    1173     16277057 :           continue;
    1174              :         }
    1175              : 
    1176     16356517 :       gimple *stmt = stmt_info->stmt;
    1177     16356517 :       if (dump_enabled_p ())
    1178       213305 :         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
    1179              : 
    1180              :       /* Fail to vectorize statements marked as unvectorizable, throw
    1181              :          or are volatile.  */
    1182     16356517 :       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
    1183     16169231 :           || stmt_can_throw_internal (cfun, stmt)
    1184     31802764 :           || gimple_has_volatile_ops (stmt))
    1185              :         {
    1186       192770 :           if (dump_enabled_p ())
    1187          199 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1188              :                              "Build SLP failed: unvectorizable statement %G",
    1189              :                              stmt);
    1190              :           /* ???  For BB vectorization we want to commutate operands in a way
    1191              :              to shuffle all unvectorizable defs into one operand and have
    1192              :              the other still vectorized.  The following doesn't reliably
    1193              :              work for this though but it's the easiest we can do here.  */
    1194       192770 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1195        62854 :             continue;
    1196              :           /* Fatal mismatch.  */
    1197       129916 :           matches[0] = false;
    1198       129916 :           return false;
    1199              :         }
    1200              : 
    1201     16163747 :       gcall *call_stmt = dyn_cast <gcall *> (stmt);
    1202     16163747 :       tree lhs = gimple_get_lhs (stmt);
    1203     16163747 :       if (lhs == NULL_TREE && !call_stmt)
    1204              :         {
    1205           36 :           if (dump_enabled_p ())
    1206            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1207              :                              "Build SLP failed: not GIMPLE_ASSIGN nor "
    1208              :                              "GIMPLE_CALL %G", stmt);
    1209           36 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1210           36 :             continue;
    1211              :           /* Fatal mismatch.  */
    1212            0 :           matches[0] = false;
    1213            0 :           return false;
    1214              :         }
    1215              : 
    1216     16163711 :       if (call_stmt)
    1217              :         {
    1218        92843 :           combined_fn cfn = gimple_call_combined_fn (call_stmt);
    1219        92843 :           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
    1220        49465 :             rhs_code = cfn;
    1221              :           else
    1222              :             rhs_code = CALL_EXPR;
    1223              : 
    1224        92843 :           if (cfn == CFN_GATHER_LOAD
    1225        92843 :               || cfn == CFN_SCATTER_STORE)
    1226              :             ldst_p = true;
    1227              :           else if (cfn == CFN_MASK_LOAD
    1228              :                    || cfn == CFN_MASK_GATHER_LOAD
    1229              :                    || cfn == CFN_MASK_LEN_GATHER_LOAD
    1230              :                    || cfn == CFN_MASK_SCATTER_STORE
    1231              :                    || cfn == CFN_MASK_LEN_SCATTER_STORE)
    1232              :             {
    1233              :               ldst_p = true;
    1234              :               ldst_masklen_p = true;
    1235              :             }
    1236              :           else if (cfn == CFN_MASK_STORE)
    1237              :             {
    1238              :               ldst_p = true;
    1239              :               ldst_masklen_p = true;
    1240              :               rhs_code = CFN_MASK_STORE;
    1241              :             }
    1242              :           else if (cfn == CFN_GOMP_SIMD_LANE)
    1243              :             ;
    1244        83700 :           else if ((cfn != CFN_LAST
    1245              :                     && cfn != CFN_MASK_CALL
    1246        40322 :                     && internal_fn_p (cfn)
    1247        31185 :                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
    1248        83626 :                    || gimple_call_tail_p (call_stmt)
    1249        83626 :                    || gimple_call_noreturn_p (call_stmt)
    1250       167326 :                    || gimple_call_chain (call_stmt))
    1251              :             {
    1252          423 :               if (dump_enabled_p ())
    1253           13 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1254              :                                  "Build SLP failed: unsupported call type %G",
    1255              :                                  (gimple *) call_stmt);
    1256          423 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1257           62 :                 continue;
    1258              :               /* Fatal mismatch.  */
    1259          361 :               matches[0] = false;
    1260          361 :               return false;
    1261              :             }
    1262              :         }
    1263     16070868 :       else if (gimple_code (stmt) == GIMPLE_PHI)
    1264              :         {
    1265              :           rhs_code = ERROR_MARK;
    1266              :           phi_p = true;
    1267              :         }
    1268              :       else
    1269              :         {
    1270     15347884 :           rhs_code = gimple_assign_rhs_code (stmt);
    1271     15347884 :           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
    1272              :         }
    1273              : 
    1274              :       /* Check the operation.  */
    1275     16163288 :       if (i == 0)
    1276              :         {
    1277      4572315 :           first_lhs = lhs;
    1278      4572315 :           first_stmt_code = rhs_code;
    1279      4572315 :           first_stmt_ldst_p = ldst_p;
    1280      4572315 :           first_stmt_ldst_masklen_p = ldst_masklen_p;
    1281      4572315 :           first_stmt_phi_p = phi_p;
    1282      4572315 :           first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
    1283              : 
    1284              :           /* Shift arguments should be equal in all the packed stmts for a
    1285              :              vector shift with scalar shift operand.  */
    1286      4572315 :           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
    1287      4449936 :               || rhs_code == LROTATE_EXPR
    1288      9022209 :               || rhs_code == RROTATE_EXPR)
    1289              :             {
    1290              :               /* First see if we have a vector/vector shift.  */
    1291       122622 :               if (!directly_supported_p (rhs_code, vectype, optab_vector))
    1292              :                 {
    1293              :                   /* No vector/vector shift, try for a vector/scalar shift.  */
    1294       114562 :                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
    1295              :                     {
    1296         9419 :                       if (dump_enabled_p ())
    1297          375 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1298              :                                          "Build SLP failed: "
    1299              :                                          "op not supported by target.\n");
    1300         9419 :                       if (is_a <bb_vec_info> (vinfo) && i != 0)
    1301              :                         continue;
    1302              :                       /* Fatal mismatch.  */
    1303         9419 :                       matches[0] = false;
    1304         9419 :                       return false;
    1305              :                     }
    1306       105143 :                   need_same_oprnds = true;
    1307       105143 :                   first_op1 = gimple_assign_rhs2 (stmt);
    1308              :                 }
    1309              :             }
    1310      4449693 :           else if (rhs_code == WIDEN_LSHIFT_EXPR)
    1311              :             {
    1312            0 :               need_same_oprnds = true;
    1313            0 :               first_op1 = gimple_assign_rhs2 (stmt);
    1314              :             }
    1315      4449693 :           else if (!ldst_p
    1316      4449693 :                    && rhs_code == BIT_FIELD_REF)
    1317              :             {
    1318         5743 :               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
    1319         5743 :               if (!is_a <bb_vec_info> (vinfo)
    1320         5617 :                   || TREE_CODE (vec) != SSA_NAME
    1321              :                   /* When the element types are not compatible we pun the
    1322              :                      source to the target vectype which requires equal size.  */
    1323        11348 :                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
    1324         4890 :                        || !types_compatible_p (TREE_TYPE (vectype),
    1325         4890 :                                                TREE_TYPE (TREE_TYPE (vec))))
    1326         1031 :                       && !operand_equal_p (TYPE_SIZE (vectype),
    1327         1031 :                                            TYPE_SIZE (TREE_TYPE (vec)))))
    1328              :                 {
    1329          781 :                   if (dump_enabled_p ())
    1330            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1331              :                                      "Build SLP failed: "
    1332              :                                      "BIT_FIELD_REF not supported\n");
    1333              :                   /* Fatal mismatch.  */
    1334          781 :                   matches[0] = false;
    1335          781 :                   return false;
    1336              :                 }
    1337              :             }
    1338      4443950 :           else if (rhs_code == CFN_DIV_POW2)
    1339              :             {
    1340            0 :               need_same_oprnds = true;
    1341            0 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1342              :             }
    1343      4443950 :           else if (rhs_code == CFN_GOMP_SIMD_LANE)
    1344              :             {
    1345         3153 :               need_same_oprnds = true;
    1346         3153 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1347              :             }
    1348              :         }
    1349              :       else
    1350              :         {
    1351     11591305 :           if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1352              :               /* For SLP reduction groups the index isn't necessarily
    1353              :                  uniform but only that of the first stmt matters.  */
    1354         1640 :               && !(first_reduc_idx != -1
    1355         1640 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1356         1640 :                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    1357     11590973 :               && !(first_reduc_idx != -1
    1358          898 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1359          898 :                    && rhs_code.is_tree_code ()
    1360          898 :                    && commutative_tree_code (tree_code (rhs_code))
    1361          704 :                    && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
    1362              :             {
    1363          332 :               if (dump_enabled_p ())
    1364              :                 {
    1365           12 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1366              :                                    "Build SLP failed: different reduc_idx "
    1367              :                                    "%d instead of %d in %G",
    1368              :                                    STMT_VINFO_REDUC_IDX (stmt_info),
    1369              :                                    first_reduc_idx, stmt);
    1370              :                 }
    1371              :               /* Mismatch.  */
    1372          332 :               continue;
    1373              :             }
    1374     11590641 :           if (!ldst_p
    1375      9167541 :               && first_stmt_code != rhs_code
    1376     12974558 :               && alt_stmt_code == ERROR_MARK)
    1377              :             alt_stmt_code = rhs_code;
    1378     12954586 :           if ((!ldst_p
    1379      9167541 :                && first_stmt_code != rhs_code
    1380      1383917 :                && (first_stmt_code != IMAGPART_EXPR
    1381          127 :                    || rhs_code != REALPART_EXPR)
    1382      1383897 :                && (first_stmt_code != REALPART_EXPR
    1383          458 :                    || rhs_code != IMAGPART_EXPR)
    1384              :                /* Handle mismatches in plus/minus by computing both
    1385              :                   and merging the results.  */
    1386      1383886 :                && !((((first_stmt_code == PLUS_EXPR
    1387      1287169 :                        || first_stmt_code == MINUS_EXPR)
    1388       116616 :                       && (alt_stmt_code == PLUS_EXPR
    1389       107822 :                           || alt_stmt_code == MINUS_EXPR))
    1390      1361366 :                      || ((first_stmt_code == CFN_FMA
    1391      1361364 :                           || first_stmt_code == CFN_FMS)
    1392            2 :                          && (alt_stmt_code == CFN_FMA
    1393            2 :                              || alt_stmt_code == CFN_FMS)))
    1394        22522 :                     && rhs_code == alt_stmt_code)
    1395      1401243 :                && !(first_stmt_code.is_tree_code ()
    1396      1285846 :                     && rhs_code.is_tree_code ()
    1397      1193189 :                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
    1398              :                         == tcc_comparison)
    1399       127701 :                     && (swap_tree_comparison (tree_code (first_stmt_code))
    1400       127701 :                         == tree_code (rhs_code))
    1401              :                     && (first_reduc_idx == -1
    1402            0 :                         || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
    1403              :               || (ldst_p
    1404      4846200 :                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    1405      2423100 :                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
    1406              :               || (ldst_p
    1407      2381129 :                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1408      2381129 :                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
    1409     10226838 :               || first_stmt_ldst_p != ldst_p
    1410     10226704 :               || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
    1411     21817337 :               || first_stmt_phi_p != phi_p)
    1412              :             {
    1413      1363945 :               if (dump_enabled_p ())
    1414              :                 {
    1415         2845 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1416              :                                    "Build SLP failed: different operation "
    1417              :                                    "in stmt %G", stmt);
    1418         2845 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1419              :                                    "original stmt %G", first_stmt_info->stmt);
    1420              :                 }
    1421              :               /* Mismatch.  */
    1422      1363945 :               continue;
    1423              :             }
    1424              : 
    1425     10229059 :           if (!ldst_p
    1426      7845697 :               && first_stmt_code == BIT_FIELD_REF
    1427     10232499 :               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
    1428         5803 :                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
    1429              :             {
    1430         2363 :               if (dump_enabled_p ())
    1431           40 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1432              :                                  "Build SLP failed: different BIT_FIELD_REF "
    1433              :                                  "arguments in %G", stmt);
    1434              :               /* Mismatch.  */
    1435         2363 :               continue;
    1436              :             }
    1437              : 
    1438     10224333 :           if (call_stmt
    1439        21718 :               && first_stmt_code != CFN_MASK_LOAD
    1440     10245565 :               && first_stmt_code != CFN_MASK_STORE)
    1441              :             {
    1442        20886 :               if (!is_a <gcall *> (stmts[0]->stmt)
    1443        20886 :                   || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
    1444              :                                           call_stmt, true))
    1445              :                 {
    1446         5186 :                   if (dump_enabled_p ())
    1447            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1448              :                                      "Build SLP failed: different calls in %G",
    1449              :                                      stmt);
    1450              :                   /* Mismatch.  */
    1451         5186 :                   continue;
    1452              :                 }
    1453              :             }
    1454              : 
    1455     10048605 :           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
    1456     10908612 :               && (gimple_bb (first_stmt_info->stmt)
    1457       860007 :                   != gimple_bb (stmt_info->stmt)))
    1458              :             {
    1459        27078 :               if (dump_enabled_p ())
    1460            8 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1461              :                                  "Build SLP failed: different BB for PHI "
    1462              :                                  "or possibly trapping operation in %G", stmt);
    1463              :               /* Mismatch.  */
    1464        27078 :               continue;
    1465              :             }
    1466              : 
    1467     10192069 :           if (need_same_oprnds)
    1468              :             {
    1469        53025 :               tree other_op1 = gimple_arg (stmt, 1);
    1470        53025 :               if (!operand_equal_p (first_op1, other_op1, 0))
    1471              :                 {
    1472         6951 :                   if (dump_enabled_p ())
    1473          123 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1474              :                                      "Build SLP failed: different shift "
    1475              :                                      "arguments in %G", stmt);
    1476              :                   /* Mismatch.  */
    1477         6951 :                   continue;
    1478              :                 }
    1479              :             }
    1480              : 
    1481     10185855 :           if (first_lhs
    1482     10185118 :               && lhs
    1483     10185118 :               && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
    1484              :             {
    1485          737 :               if (dump_enabled_p ())
    1486            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1487              :                                  "Build SLP failed: different vector type "
    1488              :                                  "in %G", stmt);
    1489              :               /* Mismatch.  */
    1490          737 :               continue;
    1491              :             }
    1492              :         }
    1493              : 
    1494              :       /* Grouped store or load.  */
    1495     14746496 :       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    1496              :         {
    1497      3729985 :           gcc_assert (ldst_p);
    1498      3729985 :           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
    1499              :             {
    1500              :               /* Store.  */
    1501      2963851 :               gcc_assert (rhs_code == CFN_MASK_STORE
    1502              :                           || REFERENCE_CLASS_P (lhs)
    1503              :                           || DECL_P (lhs));
    1504              :             }
    1505              :           else
    1506              :             {
    1507              :               /* Load.  */
    1508       766134 :               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
    1509       766134 :               if (prev_first_load)
    1510              :                 {
    1511              :                   /* Check that there are no loads from different interleaving
    1512              :                      chains in the same node.  */
    1513       339183 :                   if (prev_first_load != first_load)
    1514              :                     {
    1515        41632 :                       if (dump_enabled_p ())
    1516         1988 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
    1517              :                                          vect_location,
    1518              :                                          "Build SLP failed: different "
    1519              :                                          "interleaving chains in one node %G",
    1520              :                                          stmt);
    1521              :                       /* Mismatch.  */
    1522        41632 :                       continue;
    1523              :                     }
    1524              :                 }
    1525              :               else
    1526              :                 prev_first_load = first_load;
    1527              :            }
    1528              :         }
    1529              :       /* Non-grouped store or load.  */
    1530     11016511 :       else if (ldst_p)
    1531              :         {
    1532       705924 :           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
    1533       495068 :               && rhs_code != CFN_GATHER_LOAD
    1534              :               && rhs_code != CFN_MASK_GATHER_LOAD
    1535              :               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
    1536              :               && rhs_code != CFN_SCATTER_STORE
    1537              :               && rhs_code != CFN_MASK_SCATTER_STORE
    1538              :               && rhs_code != CFN_MASK_LEN_SCATTER_STORE
    1539       495068 :               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1540              :               /* Not grouped loads are handled as externals for BB
    1541              :                  vectorization.  For loop vectorization we can handle
    1542              :                  splats the same we handle single element interleaving.
    1543              :                  Likewise we can handle a collection of invariant refs.  */
    1544      1182658 :               && (is_a <bb_vec_info> (vinfo)
    1545       476734 :                   || (stmt_info != first_stmt_info
    1546        44304 :                   && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
    1547          157 :                       && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
    1548              :                                                          (first_stmt_info)))))))
    1549              :             {
    1550              :               /* Not grouped load.  */
    1551        43990 :               if (dump_enabled_p ())
    1552          121 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1553              :                                  "Build SLP failed: not grouped load %G", stmt);
    1554              : 
    1555        43990 :               if (i != 0)
    1556        43990 :                 continue;
    1557              :               /* Fatal mismatch.  */
    1558            0 :               matches[0] = false;
    1559            0 :               return false;
    1560              :             }
    1561              :         }
    1562              :       /* Not memory operation.  */
    1563              :       else
    1564              :         {
    1565     10310587 :           if (!phi_p
    1566      9709605 :               && rhs_code.is_tree_code ()
    1567      9667785 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
    1568      1415523 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
    1569       902340 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
    1570       854089 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
    1571        64399 :               && rhs_code != VIEW_CONVERT_EXPR
    1572              :               && rhs_code != CALL_EXPR
    1573              :               && rhs_code != BIT_FIELD_REF
    1574     10310587 :               && rhs_code != SSA_NAME)
    1575              :             {
    1576        18441 :               if (dump_enabled_p ())
    1577            7 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1578              :                                  "Build SLP failed: operation unsupported %G",
    1579              :                                  stmt);
    1580        18441 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1581            0 :                 continue;
    1582              :               /* Fatal mismatch.  */
    1583        18441 :               matches[0] = false;
    1584        18441 :               return false;
    1585              :             }
    1586              : 
    1587     10292146 :           if (rhs_code == COND_EXPR)
    1588              :             {
    1589        45840 :               tree cond_expr = gimple_assign_rhs1 (stmt);
    1590        45840 :               enum tree_code cond_code = TREE_CODE (cond_expr);
    1591        45840 :               enum tree_code swap_code = ERROR_MARK;
    1592        45840 :               enum tree_code invert_code = ERROR_MARK;
    1593              : 
    1594        45840 :               if (i == 0)
    1595        37161 :                 first_cond_code = TREE_CODE (cond_expr);
    1596         8679 :               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
    1597              :                 {
    1598            0 :                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
    1599            0 :                   swap_code = swap_tree_comparison (cond_code);
    1600            0 :                   invert_code = invert_tree_comparison (cond_code, honor_nans);
    1601              :                 }
    1602              : 
    1603        45840 :               if (first_cond_code == cond_code)
    1604              :                 ;
    1605              :               /* Isomorphic can be achieved by swapping.  */
    1606            0 :               else if (first_cond_code == swap_code)
    1607            0 :                 swap[i] = 1;
    1608              :               /* Isomorphic can be achieved by inverting.  */
    1609            0 :               else if (first_cond_code == invert_code)
    1610            0 :                 swap[i] = 2;
    1611              :               else
    1612              :                 {
    1613            0 :                   if (dump_enabled_p ())
    1614            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1615              :                                      "Build SLP failed: different"
    1616              :                                      " operation %G", stmt);
    1617              :                   /* Mismatch.  */
    1618            0 :                   continue;
    1619              :                 }
    1620              :             }
    1621              : 
    1622     10292146 :           if (i != 0
    1623      7804561 :               && first_stmt_code != rhs_code
    1624        62073 :               && first_stmt_code.is_tree_code ()
    1625        62071 :               && rhs_code.is_tree_code ()
    1626        62071 :               && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
    1627     10331861 :               && (swap_tree_comparison ((tree_code)first_stmt_code)
    1628        39715 :                   == (tree_code)rhs_code))
    1629        39715 :             swap[i] = 1;
    1630              : 
    1631     10292146 :           if (i != 0
    1632      7804561 :               && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1633         1084 :               && first_reduc_idx != -1
    1634         1084 :               && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1635         1084 :               && rhs_code.is_tree_code ()
    1636         1084 :               && commutative_tree_code (tree_code (rhs_code))
    1637     10293230 :               && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
    1638         1084 :             swap[i] = 1;
    1639              :         }
    1640              : 
    1641     14642433 :       matches[i] = true;
    1642              :     }
    1643              : 
    1644     19201097 :   for (i = 0; i < group_size; ++i)
    1645     15311347 :     if (!matches[i])
    1646              :       return false;
    1647              : 
    1648              :   /* If we allowed a two-operation SLP node verify the target can cope
    1649              :      with the permute we are going to use.  */
    1650      3889750 :   if (alt_stmt_code != ERROR_MARK
    1651      3889750 :       && (!alt_stmt_code.is_tree_code ()
    1652        51411 :           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
    1653        51411 :               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    1654              :     {
    1655        12263 :       *two_operators = true;
    1656              :     }
    1657              : 
    1658      3889750 :   if (maybe_soft_fail)
    1659              :     {
    1660       140863 :       unsigned HOST_WIDE_INT const_nunits;
    1661       140863 :       if (!TYPE_VECTOR_SUBPARTS
    1662       140863 :             (soft_fail_nunits_vectype).is_constant (&const_nunits)
    1663       140863 :           || const_nunits > group_size)
    1664            0 :         matches[0] = false;
    1665              :       else
    1666              :         {
    1667              :           /* With constant vector elements simulate a mismatch at the
    1668              :              point we need to split.  */
    1669       140863 :           unsigned tail = group_size & (const_nunits - 1);
    1670       140863 :           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
    1671              :         }
    1672       140863 :       return false;
    1673              :     }
    1674              : 
    1675              :   return true;
    1676              : }
    1677              : 
    1678              : /* Traits for the hash table that records SLP builds (successful and
                           failed) for a stmt set; see scalar_stmts_to_slp_tree_map_t.
    1679              :    The key is a vector of stmts.
    1680              :    Note we never remove apart from at destruction time so we do not
                           need a special value for deleted that differs from empty; both
                           states are tested with !x.exists ().  */
    1681              : struct bst_traits
    1682              : {
    1683              :   typedef vec <stmt_vec_info> value_type;
    1684              :   typedef vec <stmt_vec_info> compare_type;
    1685              :   static inline hashval_t hash (value_type);
    1686              :   static inline bool equal (value_type existing, value_type candidate);
    1687    432043470 :   static inline bool is_empty (value_type x) { return !x.exists (); }
    1688     96138605 :   static inline bool is_deleted (value_type x) { return !x.exists (); }
    1689              :   static const bool empty_zero_p = true;
                          /* Marking a slot empty or deleted and removing an entry all just
                             release the key vector.  */
    1690            0 :   static inline void mark_empty (value_type &x) { x.release (); }
    1691              :   static inline void mark_deleted (value_type &x) { x.release (); }
    1692      8301678 :   static inline void remove (value_type &x) { x.release (); }
    1693              : };
    1694              : inline hashval_t
    1695     83776600 : bst_traits::hash (value_type x)
    1696              : {
                          /* Hash X by feeding the gimple UID of each stmt, in order, into
                             the hasher; NULL entries contribute -1.  */
    1697     83776600 :   inchash::hash h;
    1698    398055869 :   for (unsigned i = 0; i < x.length (); ++i)
    1699    314279269 :     h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
    1700     83776600 :   return h.end ();
    1701              : }
    1702              : inline bool
    1703     73157694 : bst_traits::equal (value_type existing, value_type candidate)
    1704              : {
                          /* Two keys are equal when they have the same length and hold the
                             same stmt_vec_info pointers in the same order.  */
    1705    219473082 :   if (existing.length () != candidate.length ())
    1706              :     return false;
    1707     74981630 :   for (unsigned i = 0; i < existing.length (); ++i)
    1708     71159687 :     if (existing[i] != candidate[i])
    1709              :       return false;
    1710              :   return true;
    1711              : }
    1712              : 
                        /* Map from a vector of scalar stmts (hashed and compared through
                           bst_traits) to the SLP tree recorded for that stmt set.  */
    1713              : typedef hash_map <vec <stmt_vec_info>, slp_tree,
    1714              :                   simple_hashmap_traits <bst_traits, slp_tree> >
    1715              :   scalar_stmts_to_slp_tree_map_t;
    1716              : 
    1717              : /* Release BST_MAP: call vect_free_slp_tree on every non-NULL SLP tree
                           stored in it, then delete the map itself.  BST_MAP must not be
                           used afterwards.  */
    1718              : 
    1719              : static void
    1720      1638392 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
    1721              : {
    1722              :   /* The map keeps a reference on SLP nodes built, release that.  */
    1723      9940070 :   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
    1724     18241748 :        it != bst_map->end (); ++it)
    1725      8301678 :     if ((*it).second)
    1726      8301678 :       vect_free_slp_tree ((*it).second);
    1727      1638392 :   delete bst_map;
    1728      1638392 : }
    1729              : 
    1730              : /* An operand OP of a linearized associatable chain together with its
                           def type DT and the operation CODE recorded for it by
                           vect_slp_linearize_chain.
                           ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
    1731              :    but then vec::insert does memmove and that's not compatible with
    1732              :    std::pair.  */
    1733              : struct chain_op_t
    1734              : {
    1735      3645069 :   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
    1736      3645069 :       : code (code_), dt (dt_), op (op_) {}
    1737              :   tree_code code;
    1738              :   vect_def_type dt;
    1739              :   tree op;
    1740              : };
    1741              : 
    1742              : /* Comparator for sorting associatable chains.  OP1_ and OP2_ point to
                           chain_op_t entries; they are ordered by def type first and by
                           operation code second.  The third argument is unused.  Returns
                           a negative, zero or positive value accordingly.  */
    1743              : 
    1744              : static int
    1745      8449842 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
    1746              : {
    1747      8449842 :   auto *op1 = (const chain_op_t *) op1_;
    1748      8449842 :   auto *op2 = (const chain_op_t *) op2_;
    1749      8449842 :   if (op1->dt != op2->dt)
    1750      1038710 :     return (int)op1->dt - (int)op2->dt;
    1751      7411132 :   return (int)op1->code - (int)op2->code;
    1752              : }
    1753              : 
    1754              : /* Linearize the associatable expression chain at START with the
    1755              :    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
    1756              :    filling CHAIN with the result and using WORKLIST as intermediate storage.
    1757              :    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
    1758              :    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
    1759              :    stmts, starting with START.
                           CODE_STMT and ALT_CODE_STMT are only assigned while they are
                           still NULL.  Each entry appended to CHAIN carries the operand,
                           its def type as classified through VINFO and the operation code
                           with which it contributes, with the sign of MINUS_EXPR operands
                           propagated down the chain.  */
    1760              : 
    1761              : static void
    1762      1633216 : vect_slp_linearize_chain (vec_info *vinfo,
    1763              :                           vec<std::pair<tree_code, gimple *> > &worklist,
    1764              :                           vec<chain_op_t> &chain,
    1765              :                           enum tree_code code, gimple *start,
    1766              :                           gimple *&code_stmt, gimple *&alt_code_stmt,
    1767              :                           vec<gimple *> *chain_stmts)
    1768              : {
    1769              :   /* For each lane linearize the addition/subtraction (or other
    1770              :      uniform associatable operation) expression tree.  */
    1771      1633216 :   worklist.safe_push (std::make_pair (code, start));
    1772      3645069 :   while (!worklist.is_empty ())
    1773              :     {
                              /* ENTRY pairs a stmt with the code (IN_CODE) with which that
                                 stmt's value contributes to the chain.  */
    1774      2011853 :       auto entry = worklist.pop ();
    1775      2011853 :       gassign *stmt = as_a <gassign *> (entry.second);
    1776      2011853 :       enum tree_code in_code = entry.first;
    1777      4023706 :       enum tree_code this_code = gimple_assign_rhs_code (stmt);
    1778              :       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
    1779      2011853 :       if (!code_stmt
    1780      2011853 :           && gimple_assign_rhs_code (stmt) == code)
    1781      1379727 :         code_stmt = stmt;
    1782       632126 :       else if (!alt_code_stmt
    1783       632126 :                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
    1784       330802 :         alt_code_stmt = stmt;
    1785      2011853 :       if (chain_stmts)
    1786      1962575 :         chain_stmts->safe_push (stmt);
                              /* Visit operands 1 and 2 of STMT.  */
    1787      6035559 :       for (unsigned opnum = 1; opnum <= 2; ++opnum)
    1788              :         {
    1789      4023706 :           tree op = gimple_op (stmt, opnum);
    1790      4023706 :           vect_def_type dt;
    1791      4023706 :           stmt_vec_info def_stmt_info;
    1792      4023706 :           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
    1793      4023706 :           gcc_assert (res);
                                  /* When the def is a pattern stmt continue with the LHS of
                                     that pattern stmt instead of the original operand.  */
    1794      4023706 :           if (dt == vect_internal_def
    1795      4023706 :               && is_pattern_stmt_p (def_stmt_info))
    1796         6495 :             op = gimple_get_lhs (def_stmt_info->stmt);
    1797      4023706 :           gimple *use_stmt;
    1798      4023706 :           use_operand_p use_p;
                                  /* Keep linearizing through OP's definition when OP is an
                                     internal def for which single_imm_use succeeds and whose
                                     defining stmt is an assignment using CODE (or MINUS_EXPR
                                     when CODE is PLUS_EXPR).  Otherwise OP is recorded as a
                                     leaf of the chain.  */
    1799      4023706 :           if (dt == vect_internal_def
    1800      3731472 :               && single_imm_use (op, &use_p, &use_stmt)
    1801      2301379 :               && is_gimple_assign (def_stmt_info->stmt)
    1802      6143088 :               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
    1803      1768324 :                   || (code == PLUS_EXPR
    1804       888744 :                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
    1805              :                           == MINUS_EXPR))))
    1806              :             {
                                      /* The first operand of a MINUS_EXPR counts as added;
                                         the result is flipped when STMT itself was reached
                                         with MINUS_EXPR.  */
    1807       378637 :               tree_code op_def_code = this_code;
    1808       378637 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1809        55589 :                 op_def_code = PLUS_EXPR;
    1810       378637 :               if (in_code == MINUS_EXPR)
    1811          193 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1812       378637 :               worklist.safe_push (std::make_pair (op_def_code,
    1813       378637 :                                                   def_stmt_info->stmt));
    1814              :             }
    1815              :           else
    1816              :             {
                                      /* Same sign computation as above, but OP ends the
                                         walk and is appended to CHAIN.  */
    1817      3645069 :               tree_code op_def_code = this_code;
    1818      3645069 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1819       279509 :                 op_def_code = PLUS_EXPR;
    1820      3645069 :               if (in_code == MINUS_EXPR)
    1821         6745 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1822      3645069 :               chain.safe_push (chain_op_t (op_def_code, dt, op));
    1823              :             }
    1824              :         }
    1825              :     }
    1826      1633216 : }
    1827              : 
                        /* Forward declaration, needed because vect_build_slp_tree below
                           calls this function ahead of its definition.  */
    1828              : static slp_tree
    1829              : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    1830              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1831              :                        poly_uint64 *max_nunits,
    1832              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    1833              :                        scalar_stmts_to_slp_tree_map_t *bst_map);
    1834              : 
                        /* Build the SLP tree for the scalar stmts STMTS with GROUP_SIZE lanes,
                           re-using the result recorded in BST_MAP when STMTS was already
                           seen.  On success return the node and merge its maximum number of
                           units into *MAX_NUNITS; STMTS is released when an existing tree is
                           re-used.  On failure return NULL with MATCHES describing the
                           mismatch (copied from the recorded failure on re-use).
                           *LIMIT is the remaining discovery budget: it is decremented for
                           every new node with more than one stmt, and when it is already
                           zero discovery fails with all of MATCHES cleared.
                           *TREE_SIZE is only passed on to vect_build_slp_tree_2.  */
    1835              : static slp_tree
    1836      5635730 : vect_build_slp_tree (vec_info *vinfo,
    1837              :                      vec<stmt_vec_info> stmts, unsigned int group_size,
    1838              :                      poly_uint64 *max_nunits,
    1839              :                      bool *matches, unsigned *limit, unsigned *tree_size,
    1840              :                      scalar_stmts_to_slp_tree_map_t *bst_map)
    1841              : {
                          /* A previous discovery for the same stmts exists: re-use its
                             outcome, be it a tree or a recorded failure.  */
    1842      5635730 :   if (slp_tree *leader = bst_map->get (stmts))
    1843              :     {
    1844       389941 :       if (dump_enabled_p ())
    1845        16822 :         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
    1846        16822 :                          !(*leader)->failed ? "" : "failed ",
    1847              :                          (void *) *leader);
    1848       389941 :       if (!(*leader)->failed)
    1849              :         {
    1850       343470 :           SLP_TREE_REF_COUNT (*leader)++;
    1851       343470 :           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
    1852       343470 :           stmts.release ();
    1853       343470 :           return *leader;
    1854              :         }
                              /* Replay the mismatch information saved with the failed node.  */
    1855        46471 :       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
    1856        46471 :       return NULL;
    1857              :     }
    1858              : 
    1859              :   /* Single-lane SLP doesn't have the chance of run-away, do not account
    1860              :      it to the limit.  */
    1861      5245789 :   if (stmts.length () > 1)
    1862              :     {
    1863      3065353 :       if (*limit == 0)
    1864              :         {
    1865         1501 :           if (dump_enabled_p ())
    1866           12 :             dump_printf_loc (MSG_NOTE, vect_location,
    1867              :                              "SLP discovery limit exceeded\n");
    1868         1501 :           memset (matches, 0, sizeof (bool) * group_size);
    1869         1501 :           return NULL;
    1870              :         }
    1871      3063852 :       --*limit;
    1872              :     }
    1873              : 
    1874              :   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
    1875              :      so we can pick up backedge destinations during discovery.  */
    1876      5244288 :   slp_tree res = new _slp_tree;
    1877      5244288 :   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
    1878      5244288 :   SLP_TREE_SCALAR_STMTS (res) = stmts;
                          /* The map key is a copy of STMTS; the node itself refers to the
                             vector passed in.  */
    1879      5244288 :   bst_map->put (stmts.copy (), res);
    1880              : 
    1881      5244288 :   if (dump_enabled_p ())
    1882       141845 :     dump_printf_loc (MSG_NOTE, vect_location,
    1883              :                      "starting SLP discovery for node %p\n", (void *) res);
    1884              : 
    1885      5244288 :   poly_uint64 this_max_nunits = 1;
    1886      5244288 :   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
    1887              :                                         &this_max_nunits,
    1888              :                                         matches, limit, tree_size, bst_map);
    1889      5244288 :   if (!res_)
    1890              :     {
    1891      1921916 :       if (dump_enabled_p ())
    1892         8016 :         dump_printf_loc (MSG_NOTE, vect_location,
    1893              :                          "SLP discovery for node %p failed\n", (void *) res);
    1894              :       /* Mark the node invalid so we can detect those when still in use
    1895              :          as backedge destinations.  */
    1896      1921916 :       SLP_TREE_SCALAR_STMTS (res) = vNULL;
    1897      1921916 :       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
    1898      1921916 :       res->failed = XNEWVEC (bool, group_size);
                              /* A failed discovery must have flagged at least one lane as
                                 not matching.  */
    1899      1921916 :       if (flag_checking)
    1900              :         {
    1901              :           unsigned i;
    1902      3417161 :           for (i = 0; i < group_size; ++i)
    1903      3417161 :             if (!matches[i])
    1904              :               break;
    1905      1921916 :           gcc_assert (i < group_size);
    1906              :         }
                              /* Remember the mismatch so later lookups of the same stmts can
                                 replay it without repeating discovery.  */
    1907      1921916 :       memcpy (res->failed, matches, sizeof (bool) * group_size);
    1908              :     }
    1909              :   else
    1910              :     {
    1911      3322372 :       if (dump_enabled_p ())
    1912       133829 :         dump_printf_loc (MSG_NOTE, vect_location,
    1913              :                          "SLP discovery for node %p succeeded\n",
    1914              :                          (void *) res);
    1915      3322372 :       gcc_assert (res_ == res);
    1916      3322372 :       res->max_nunits = this_max_nunits;
    1917      3322372 :       vect_update_max_nunits (max_nunits, this_max_nunits);
    1918              :       /* Keep a reference for the bst_map use.  */
    1919      3322372 :       SLP_TREE_REF_COUNT (res)++;
    1920              :     }
    1921              :   return res_;
    1922              : }
    1923              : 
    1924              : /* Helper for building an associated SLP node chain.  Turn PERM into a
                           VEC_PERM_EXPR node of type VECTYPE with lane permutation LPERM
                           whose two children are created here.  Both children have the
                           operands OP0 and OP1 and differ only in their representative
                           stmt, OPER1 for the first and OPER2 for the second.  The lane
                           count of all three nodes is taken from OP1.  */
    1925              : 
    1926              : static void
    1927          122 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
    1928              :                                    slp_tree op0, slp_tree op1,
    1929              :                                    stmt_vec_info oper1, stmt_vec_info oper2,
    1930              :                                    vec<std::pair<unsigned, unsigned> > lperm)
    1931              : {
    1932          122 :   unsigned group_size = SLP_TREE_LANES (op1);
    1933              : 
    1934          122 :   slp_tree child1 = new _slp_tree;
    1935          122 :   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
    1936          122 :   SLP_TREE_VECTYPE (child1) = vectype;
    1937          122 :   SLP_TREE_LANES (child1) = group_size;
    1938          122 :   SLP_TREE_CHILDREN (child1).create (2);
    1939          122 :   SLP_TREE_CHILDREN (child1).quick_push (op0);
    1940          122 :   SLP_TREE_CHILDREN (child1).quick_push (op1);
    1941          122 :   SLP_TREE_REPRESENTATIVE (child1) = oper1;
    1942              : 
                          /* The second child shares OP0 and OP1 with the first one, so each
                             operand gets an additional reference.  */
    1943          122 :   slp_tree child2 = new _slp_tree;
    1944          122 :   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
    1945          122 :   SLP_TREE_VECTYPE (child2) = vectype;
    1946          122 :   SLP_TREE_LANES (child2) = group_size;
    1947          122 :   SLP_TREE_CHILDREN (child2).create (2);
    1948          122 :   SLP_TREE_CHILDREN (child2).quick_push (op0);
    1949          122 :   SLP_TREE_REF_COUNT (op0)++;
    1950          122 :   SLP_TREE_CHILDREN (child2).quick_push (op1);
    1951          122 :   SLP_TREE_REF_COUNT (op1)++;
    1952          122 :   SLP_TREE_REPRESENTATIVE (child2) = oper2;
    1953              : 
    1954          122 :   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
    1955          122 :   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
    1956          122 :   SLP_TREE_VECTYPE (perm) = vectype;
    1957          122 :   SLP_TREE_LANES (perm) = group_size;
    1958              :   /* ???  We should set this NULL but that's not expected.  */
    1959          122 :   SLP_TREE_REPRESENTATIVE (perm) = oper1;
    1960          122 :   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
                          /* NOTE(review): PERM's children vector is not created here, so the
                             caller presumably reserved room for two entries -- confirm.  */
    1961          122 :   SLP_TREE_CHILDREN (perm).quick_push (child1);
    1962          122 :   SLP_TREE_CHILDREN (perm).quick_push (child2);
    1963          122 : }
    1964              : 
    1965              : /* Recursively build an SLP tree starting from NODE.
    1966              :    Fail (and return NULL) if def-stmts are not isomorphic, require
    1967              :    data permutation or are of unsupported types of operation.
    1968              :    Otherwise, return NODE filled in with the discovered tree.
    1969              :    On failure at least one entry of MATCHES is false, marking where a
    1970              :    mismatch was found.  */
    1971              : 
    1972              : static slp_tree
    1973      5244288 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    1974              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1975              :                        poly_uint64 *max_nunits,
    1976              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    1977              :                        scalar_stmts_to_slp_tree_map_t *bst_map)
    1978              : {
    1979      5244288 :   unsigned nops, i, this_tree_size = 0;
    1980      5244288 :   poly_uint64 this_max_nunits = *max_nunits;
    1981              : 
    1982      5244288 :   matches[0] = false;
    1983              : 
    1984      5244288 :   stmt_vec_info stmt_info = stmts[0];
    1985      5244288 :   if (!is_a<gcall *> (stmt_info->stmt)
    1986              :       && !is_a<gassign *> (stmt_info->stmt)
    1987              :       && !is_a<gphi *> (stmt_info->stmt))
    1988              :     return NULL;
    1989              : 
    1990      5244217 :   nops = gimple_num_args (stmt_info->stmt);
    1991      5244217 :   if (const int *map = vect_get_operand_map (stmt_info->stmt,
    1992      5244217 :                                              STMT_VINFO_GATHER_SCATTER_P
    1993              :                                                (stmt_info)))
    1994        28783 :     nops = map[0];
    1995              : 
    1996              :   /* If the SLP node is a PHI (induction or reduction), terminate
    1997              :      the recursion.  */
    1998      5244217 :   bool *skip_args = XALLOCAVEC (bool, nops);
    1999      5244217 :   memset (skip_args, 0, sizeof (bool) * nops);
    2000      5244217 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    2001      2325194 :     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    2002              :       {
    2003       252219 :         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
    2004       252219 :         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    2005              :                                                     group_size);
    2006       252219 :         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
    2007              :                                      max_nunits))
    2008              :           return NULL;
    2009              : 
    2010       247815 :         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
    2011       247815 :         if (def_type == vect_induction_def)
    2012              :           {
    2013              :             /* Induction PHIs are not cycles but walk the initial
    2014              :                value.  Only for inner loops through, for outer loops
    2015              :                we need to pick up the value from the actual PHIs
    2016              :                to more easily support peeling and epilogue vectorization.  */
    2017       172531 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2018       172531 :             if (!nested_in_vect_loop_p (loop, stmt_info))
    2019       171788 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2020              :             else
    2021              :               loop = loop->inner;
    2022       172531 :             skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2023              :           }
    2024        75284 :         else if (def_type == vect_reduction_def
    2025              :                  || def_type == vect_double_reduction_def
    2026              :                  || def_type == vect_nested_cycle
    2027        75284 :                  || def_type == vect_first_order_recurrence)
    2028              :           {
    2029              :             /* Else def types have to match.  */
    2030              :             stmt_vec_info other_info;
    2031              :             bool all_same = true;
    2032       166626 :             FOR_EACH_VEC_ELT (stmts, i, other_info)
    2033              :               {
    2034        92476 :                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
    2035      1706459 :                   return NULL;
    2036        92474 :                 if (other_info != stmt_info)
    2037        15673 :                   all_same = false;
    2038              :               }
    2039        74150 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2040              :             /* Reduction initial values are not explicitly represented.  */
    2041        74150 :             if (def_type != vect_first_order_recurrence
    2042        74150 :                 && gimple_bb (stmt_info->stmt) == loop->header)
    2043        71280 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2044              :             /* Reduction chain backedge defs are filled manually.
    2045              :                ???  Need a better way to identify a SLP reduction chain PHI.
    2046              :                Or a better overall way to SLP match those.  */
    2047        74150 :             if (stmts.length () > 1
    2048        74150 :                 && all_same && def_type == vect_reduction_def)
    2049         1414 :               skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2050              :           }
    2051         1132 :         else if (def_type != vect_internal_def)
    2052              :           return NULL;
    2053              :       }
    2054              : 
    2055              : 
    2056      5239811 :   bool two_operators = false;
    2057      5239811 :   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
    2058      5239811 :   tree vectype = NULL_TREE;
    2059      5239811 :   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
    2060              :                               &this_max_nunits, matches, &two_operators,
    2061              :                               &vectype))
    2062              :     return NULL;
    2063              : 
    2064              :   /* If the SLP node is a load, terminate the recursion unless masked.  */
    2065      3748887 :   if (STMT_VINFO_DATA_REF (stmt_info)
    2066      1849599 :       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    2067              :     {
    2068       794504 :       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    2069              :         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
    2070              :       else
    2071              :         {
    2072       776407 :           *max_nunits = this_max_nunits;
    2073       776407 :           (*tree_size)++;
    2074       776407 :           node = vect_create_new_slp_node (node, stmts, 0);
    2075       776407 :           SLP_TREE_VECTYPE (node) = vectype;
    2076              :           /* And compute the load permutation.  Whether it is actually
    2077              :              a permutation depends on the unrolling factor which is
    2078              :              decided later.  */
    2079       776407 :           vec<unsigned> load_permutation;
    2080       776407 :           int j;
    2081       776407 :           stmt_vec_info load_info;
    2082       776407 :           load_permutation.create (group_size);
    2083       776407 :           stmt_vec_info first_stmt_info
    2084       776407 :             = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2085       776407 :               ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
    2086       776407 :           bool any_permute = false;
    2087      1883748 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    2088              :             {
    2089      1107341 :               int load_place;
    2090      1107341 :               if (! load_info)
    2091              :                 {
    2092        39409 :                   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2093              :                     load_place = j;
    2094              :                   else
    2095              :                     load_place = 0;
    2096              :                 }
    2097      1067932 :               else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2098       652513 :                 load_place = vect_get_place_in_interleaving_chain
    2099       652513 :                     (load_info, first_stmt_info);
    2100              :               else
    2101              :                 /* Recognize the splat case as { 0, 0, ... } but make
    2102              :                    sure to use the appropriate refs for collections
    2103              :                    of invariant refs.  */
    2104       415419 :                 load_place = (load_info == stmt_info) ? 0 : j;
    2105       692079 :               gcc_assert (load_place != -1);
    2106      1107341 :               any_permute |= load_place != j;
    2107      1107341 :               load_permutation.quick_push (load_place);
    2108              :             }
    2109              : 
    2110       776407 :           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    2111              :             {
    2112         2350 :               gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
    2113         2350 :               bool has_gaps = false;
    2114         2350 :               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2115          209 :                 for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
    2116         1346 :                      si; si = DR_GROUP_NEXT_ELEMENT (si))
    2117         1137 :                   if (DR_GROUP_GAP (si) != 1)
    2118          160 :                     has_gaps = true;
    2119              :               /* We cannot handle permuted masked loads directly, see
    2120              :                  PR114375.  We cannot handle strided masked loads or masked
    2121              :                  loads with gaps unless the mask is uniform.  */
    2122         2350 :               if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2123          209 :                    && (DR_GROUP_GAP (first_stmt_info) != 0
    2124          149 :                        || (has_gaps
    2125           55 :                            && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
    2126         4605 :                   || STMT_VINFO_STRIDED_P (stmt_info))
    2127              :                 {
    2128          108 :                   load_permutation.release ();
    2129          108 :                   matches[0] = false;
    2130       774209 :                   return NULL;
    2131              :                 }
    2132              : 
    2133              :               /* For permuted masked loads do an unpermuted masked load of
    2134              :                  the whole group followed by a SLP permute node.  */
    2135         2242 :               if (any_permute
    2136         2242 :                   || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2137           84 :                       && DR_GROUP_SIZE (first_stmt_info) != group_size))
    2138              :                 {
    2139              :                   /* Discover the whole unpermuted load.  */
    2140           44 :                   vec<stmt_vec_info> stmts2;
    2141           44 :                   unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2142           78 :                       ? DR_GROUP_SIZE (first_stmt_info) : 1;
    2143           44 :                   stmts2.create (dr_group_size);
    2144           44 :                   stmts2.quick_grow_cleared (dr_group_size);
    2145           44 :                   unsigned i = 0;
    2146           44 :                   for (stmt_vec_info si = first_stmt_info;
    2147          594 :                        si; si = DR_GROUP_NEXT_ELEMENT (si))
    2148              :                     {
    2149          550 :                       if (si != first_stmt_info)
    2150         2106 :                         for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
    2151         1600 :                           stmts2[i++] = NULL;
    2152          550 :                       stmts2[i++] = si;
    2153              :                     }
    2154           44 :                   bool *matches2 = XALLOCAVEC (bool, dr_group_size);
    2155           44 :                   slp_tree unperm_load
    2156           44 :                     = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
    2157              :                                            &this_max_nunits, matches2, limit,
    2158           44 :                                            &this_tree_size, bst_map);
    2159              :                   /* When we are able to do the full masked load emit that
    2160              :                      followed by 'node' being the desired final permutation.  */
    2161           44 :                   if (unperm_load)
    2162              :                     {
    2163           16 :                       gcc_assert
    2164              :                         (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
    2165           16 :                       lane_permutation_t lperm;
    2166           16 :                       lperm.create (group_size);
    2167           56 :                       for (unsigned j = 0; j < load_permutation.length (); ++j)
    2168           40 :                         lperm.quick_push
    2169           40 :                           (std::make_pair (0, load_permutation[j]));
    2170           16 :                       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2171           16 :                       SLP_TREE_CHILDREN (node).safe_push (unperm_load);
    2172           16 :                       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2173           16 :                       load_permutation.release ();
    2174           16 :                       return node;
    2175              :                     }
    2176           28 :                   stmts2.release ();
    2177           28 :                   load_permutation.release ();
    2178           28 :                   matches[0] = false;
    2179           28 :                   return NULL;
    2180              :                 }
    2181         2198 :               load_permutation.release ();
    2182              :             }
    2183              :           else
    2184              :             {
    2185       774057 :               if (!any_permute
    2186       674054 :                   && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2187      1049289 :                   && group_size == DR_GROUP_SIZE (first_stmt_info))
    2188       118578 :                 load_permutation.release ();
    2189       774057 :               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
    2190       774057 :               return node;
    2191              :             }
    2192              :         }
    2193              :     }
    2194      2954383 :   else if (gimple_assign_single_p (stmt_info->stmt)
    2195      2116602 :            && !gimple_vuse (stmt_info->stmt)
    2196      2962086 :            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    2197              :     {
    2198              :       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
    2199              :          the same SSA name vector of a compatible type to vectype.  */
    2200         2385 :       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
    2201         2385 :       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
    2202         2385 :       stmt_vec_info estmt_info;
    2203         7513 :       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
    2204              :         {
    2205         5275 :           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
    2206         5275 :           tree bfref = gimple_assign_rhs1 (estmt);
    2207         5275 :           HOST_WIDE_INT lane;
    2208         5275 :           if (!known_eq (bit_field_size (bfref),
    2209              :                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
    2210        10403 :               || !constant_multiple_p (bit_field_offset (bfref),
    2211         5128 :                                        bit_field_size (bfref), &lane))
    2212              :             {
    2213          147 :               lperm.release ();
    2214          147 :               matches[0] = false;
    2215          147 :               return NULL;
    2216              :             }
    2217         5128 :           lperm.safe_push (std::make_pair (0, (unsigned)lane));
    2218              :         }
    2219         2238 :       slp_tree vnode = vect_create_new_slp_node (vNULL);
    2220         2238 :       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
    2221              :         /* ???  We record vectype here but we hide any punning that may
    2222              :            be necessary and instead rely on code generation to
    2223              :            materialize VIEW_CONVERT_EXPRs as needed.  We should instead
    2224              :            make this explicit somehow.  */
    2225          704 :         SLP_TREE_VECTYPE (vnode) = vectype;
    2226              :       else
    2227              :         {
    2228              :           /* For different size but compatible elements we can still
    2229              :              use VEC_PERM_EXPR without punning.  */
    2230         1534 :           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
    2231              :                       && types_compatible_p (TREE_TYPE (vectype),
    2232              :                                              TREE_TYPE (TREE_TYPE (vec))));
    2233         1534 :           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
    2234              :         }
    2235         2238 :       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
    2236         2238 :       unsigned HOST_WIDE_INT const_nunits;
    2237         2238 :       if (nunits.is_constant (&const_nunits))
    2238         2238 :         SLP_TREE_LANES (vnode) = const_nunits;
    2239         2238 :       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
    2240              :       /* We are always building a permutation node even if it is an identity
    2241              :          permute to shield the rest of the vectorizer from the odd node
    2242              :          representing an actual vector without any scalar ops.
    2243              :          ???  We could hide it completely by making the permute node
    2244              :          external?  */
    2245         2238 :       node = vect_create_new_slp_node (node, stmts, 1);
    2246         2238 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2247         2238 :       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2248         2238 :       SLP_TREE_VECTYPE (node) = vectype;
    2249         2238 :       SLP_TREE_CHILDREN (node).quick_push (vnode);
    2250         2238 :       return node;
    2251              :     }
    2252              :   /* When discovery reaches an associatable operation see whether we can
    2253              :      improve that to match up lanes in a way superior to the operand
    2254              :      swapping code which at most looks at two defs.
    2255              :      ???  For BB vectorization we cannot do the brute-force search
    2256              :      for matching as we can succeed by means of builds from scalars
    2257              :      and have no good way to "cost" one build against another.  */
    2258      2951998 :   else if (is_a <loop_vec_info> (vinfo)
    2259              :            /* Do not bother for single-lane SLP.  */
    2260      1623393 :            && group_size > 1
    2261              :            /* ???  We don't handle !vect_internal_def defs below.  */
    2262        80168 :            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
    2263              :            /* ???  Do not associate a reduction, this will wreck REDUC_IDX
    2264              :               mapping as long as that exists on the stmt_info level.  */
    2265        63608 :            && STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2266        58465 :            && is_gimple_assign (stmt_info->stmt)
    2267        58197 :            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
    2268        40629 :                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
    2269      2971155 :            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
    2270        11656 :                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
    2271         9713 :                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
    2272              :     {
    2273              :       /* See if we have a chain of (mixed) adds or subtracts or other
    2274              :          associatable ops.  */
    2275        13653 :       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
    2276        13653 :       if (code == MINUS_EXPR)
    2277          686 :         code = PLUS_EXPR;
    2278        13653 :       stmt_vec_info other_op_stmt_info = NULL;
    2279        13653 :       stmt_vec_info op_stmt_info = NULL;
    2280        13653 :       unsigned chain_len = 0;
    2281        13653 :       auto_vec<chain_op_t> chain;
    2282        13653 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    2283        13653 :       auto_vec<vec<chain_op_t> > chains (group_size);
    2284        13653 :       auto_vec<slp_tree, 4> children;
    2285        13653 :       bool hard_fail = true;
    2286        14538 :       for (unsigned lane = 0; lane < group_size; ++lane)
    2287              :         {
    2288        14269 :           if (!stmts[lane])
    2289              :             {
    2290              :               /* ???  Below we require lane zero is present.  */
    2291            0 :               if (lane == 0)
    2292              :                 {
    2293              :                   hard_fail = false;
    2294        13384 :                   break;
    2295              :                 }
    2296            0 :               chains.quick_push (vNULL);
    2297            0 :               continue;
    2298              :             }
    2299              :           /* For each lane linearize the addition/subtraction (or other
    2300              :              uniform associatable operation) expression tree.  */
    2301        14269 :           gimple *op_stmt = NULL, *other_op_stmt = NULL;
    2302        14269 :           vect_slp_linearize_chain (vinfo, worklist, chain, code,
    2303        14269 :                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
    2304              :                                     NULL);
    2305        14269 :           if (!op_stmt_info && op_stmt)
    2306        13123 :             op_stmt_info = vinfo->lookup_stmt (op_stmt);
    2307        14269 :           if (!other_op_stmt_info && other_op_stmt)
    2308          722 :             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
    2309        14269 :           if (chain.length () == 2)
    2310              :             {
    2311              :               /* In a chain of just two elements resort to the regular
    2312              :                  operand swapping scheme.  Likewise, if we run into a
    2313              :                  length mismatch, process regularly as well: as we did
    2314              :                  not process the other lanes we cannot report a good hint
    2315              :                  as to which lanes to try swapping in the parent.  */
    2316              :               hard_fail = false;
    2317              :               break;
    2318              :             }
    2319          888 :           else if (chain_len == 0)
    2320          309 :             chain_len = chain.length ();
    2321         1158 :           else if (chain.length () != chain_len)
    2322              :             {
    2323              :               /* ???  Here we could slip in magic to compensate with
    2324              :                  neutral operands.  */
    2325            3 :               matches[lane] = false;
    2326            3 :               if (lane != group_size - 1)
    2327            3 :                 matches[0] = false;
    2328              :               break;
    2329              :             }
    2330          885 :           chains.quick_push (chain.copy ());
    2331          885 :           chain.truncate (0);
    2332              :         }
    2333        27306 :       if (chains.length () == group_size)
    2334              :         {
    2335              :           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
    2336          269 :           if (!op_stmt_info)
    2337              :             {
    2338            2 :               hard_fail = false;
    2339            2 :               goto out;
    2340              :             }
    2341              :           /* Now we have a set of chains with the same length.  */
    2342              :           /* 1. pre-sort according to def_type and operation.  */
    2343         1042 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2344         1550 :             chains[lane].stablesort (dt_sort_cmp, vinfo);
    2345          267 :           if (dump_enabled_p ())
    2346              :             {
    2347          145 :               dump_printf_loc (MSG_NOTE, vect_location,
    2348              :                                "pre-sorted chains of %s\n",
    2349              :                                get_tree_code_name (code));
    2350          649 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2351              :                 {
    2352          504 :                   if (!stmts[lane])
    2353            0 :                     dump_printf (MSG_NOTE, "--");
    2354              :                   else
    2355         2326 :                     for (unsigned opnum = 0; opnum < chain_len; ++opnum)
    2356         3644 :                       dump_printf (MSG_NOTE, "%s %T ",
    2357         1822 :                                    get_tree_code_name (chains[lane][opnum].code),
    2358         1822 :                                    chains[lane][opnum].op);
    2359          504 :                   dump_printf (MSG_NOTE, "\n");
    2360              :                 }
    2361              :             }
    2362              :           /* 2. try to build children nodes, associating as necessary.  */
    2363              :           /* 2a. prepare and perform early checks to avoid eating into
    2364              :              discovery limit unnecessarily.  */
    2365          267 :           vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
    2366         1135 :           for (unsigned n = 0; n < chain_len; ++n)
    2367              :             {
    2368          868 :               vect_def_type dt = chains[0][n].dt;
    2369          868 :               unsigned lane;
    2370         3535 :               for (lane = 0; lane < group_size; ++lane)
    2371         5334 :                 if (stmts[lane] && chains[lane][n].dt != dt)
    2372              :                   {
    2373            0 :                     if (dt == vect_constant_def
    2374            0 :                         && chains[lane][n].dt == vect_external_def)
    2375              :                       dt = vect_external_def;
    2376            0 :                     else if (dt == vect_external_def
    2377            0 :                              && chains[lane][n].dt == vect_constant_def)
    2378              :                       ;
    2379              :                     else
    2380              :                       break;
    2381              :                   }
    2382          868 :               if (lane != group_size)
    2383              :                 {
    2384            0 :                   if (dump_enabled_p ())
    2385            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    2386              :                                      "giving up on chain due to mismatched "
    2387              :                                      "def types\n");
    2388            0 :                   matches[lane] = false;
    2389            0 :                   if (lane != group_size - 1)
    2390            0 :                     matches[0] = false;
    2391            0 :                   goto out;
    2392              :                 }
    2393          868 :               dts[n] = dt;
    2394          868 :               if (dt == vect_constant_def
    2395          868 :                   || dt == vect_external_def)
    2396              :                 {
    2397              :                   /* Check whether we can build the invariant.  If we can't
    2398              :                      we never will be able to.  */
    2399           77 :                   tree type = TREE_TYPE (chains[0][n].op);
    2400          868 :                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
    2401              :                       && (TREE_CODE (type) == BOOLEAN_TYPE
    2402              :                           || !can_duplicate_and_interleave_p (vinfo, group_size,
    2403              :                                                               type)))
    2404              :                     {
    2405              :                       matches[0] = false;
    2406              :                       goto out;
    2407              :                     }
    2408              :                 }
    2409          791 :               else if (dt != vect_internal_def)
    2410              :                 {
    2411              :                   /* Not sure, we might need something special.
    2412              :                      gcc.dg/vect/pr96854.c,
    2413              :                      gfortran.dg/vect/fast-math-pr37021.f90
    2414              :                      and gfortran.dg/vect/pr61171.f trigger.  */
    2415              :                   /* Soft-fail for now.  */
    2416            0 :                   hard_fail = false;
    2417            0 :                   goto out;
    2418              :                 }
    2419              :             }
    2420              :           /* 2b. do the actual build.  */
    2421         1081 :           for (unsigned n = 0; n < chain_len; ++n)
    2422              :             {
    2423          833 :               vect_def_type dt = dts[n];
    2424          833 :               unsigned lane;
    2425          833 :               if (dt == vect_constant_def
    2426          833 :                   || dt == vect_external_def)
    2427              :                 {
    2428           77 :                   vec<tree> ops;
    2429           77 :                   ops.create (group_size);
    2430          397 :                   for (lane = 0; lane < group_size; ++lane)
    2431          243 :                     if (stmts[lane])
    2432          243 :                       ops.quick_push (chains[lane][n].op);
    2433              :                     else
    2434            0 :                       ops.quick_push (NULL_TREE);
    2435           77 :                   slp_tree child = vect_create_new_slp_node (ops);
    2436           77 :                   SLP_TREE_DEF_TYPE (child) = dt;
    2437           77 :                   children.safe_push (child);
    2438              :                 }
    2439              :               else
    2440              :                 {
    2441          756 :                   vec<stmt_vec_info> op_stmts;
    2442          756 :                   op_stmts.create (group_size);
    2443          756 :                   slp_tree child = NULL;
    2444              :                   /* Brute-force our way.  We have to consider a lane
    2445              :                      failing after fixing an earlier fail up in the
    2446              :                      SLP discovery recursion.  So track the current
    2447              :                      permute per lane.  */
    2448          756 :                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
    2449          756 :                   memset (perms, 0, sizeof (unsigned) * group_size);
    2450          835 :                   do
    2451              :                     {
    2452          835 :                       op_stmts.truncate (0);
    2453         4248 :                       for (lane = 0; lane < group_size; ++lane)
    2454         2578 :                         if (stmts[lane])
    2455         2578 :                           op_stmts.quick_push
    2456         2578 :                             (vinfo->lookup_def (chains[lane][n].op));
    2457              :                         else
    2458            0 :                           op_stmts.quick_push (NULL);
    2459          835 :                       child = vect_build_slp_tree (vinfo, op_stmts,
    2460              :                                                    group_size, &this_max_nunits,
    2461              :                                                    matches, limit,
    2462              :                                                    &this_tree_size, bst_map);
    2463              :                       /* ???  We're likely getting too many fatal mismatches
    2464              :                          here so maybe we want to ignore them (but then we
    2465              :                          have no idea which lanes fatally mismatched).  */
    2466          835 :                       if (child || !matches[0])
    2467              :                         break;
    2468              :                       /* Swap another lane we have not yet matched up into
    2469              :                          lanes that did not match.  If we run out of
    2470              :                          permute possibilities for a lane terminate the
    2471              :                          search.  */
    2472          257 :                       bool term = false;
    2473          257 :                       for (lane = 1; lane < group_size; ++lane)
    2474          178 :                         if (!matches[lane])
    2475              :                           {
    2476          150 :                             if (n + perms[lane] + 1 == chain_len)
    2477              :                               {
    2478              :                                 term = true;
    2479              :                                 break;
    2480              :                               }
    2481          131 :                             if (dump_enabled_p ())
    2482          113 :                               dump_printf_loc (MSG_NOTE, vect_location,
    2483              :                                                "swapping operand %d and %d "
    2484              :                                                "of lane %d\n",
    2485              :                                                n, n + perms[lane] + 1, lane);
    2486          262 :                             std::swap (chains[lane][n],
    2487          131 :                                        chains[lane][n + perms[lane] + 1]);
    2488          131 :                             perms[lane]++;
    2489              :                           }
    2490           98 :                       if (term)
    2491              :                         break;
    2492              :                     }
    2493              :                   while (1);
    2494          756 :                   if (!child)
    2495              :                     {
    2496           19 :                       if (dump_enabled_p ())
    2497           18 :                         dump_printf_loc (MSG_NOTE, vect_location,
    2498              :                                          "failed to match up op %d\n", n);
    2499           19 :                       op_stmts.release ();
    2500           19 :                       if (lane != group_size - 1)
    2501            9 :                         matches[0] = false;
    2502              :                       else
    2503           10 :                         matches[lane] = false;
    2504           19 :                       goto out;
    2505              :                     }
    2506          737 :                   if (dump_enabled_p ())
    2507              :                     {
    2508          397 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2509              :                                        "matched up op %d to\n", n);
    2510          397 :                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
    2511              :                     }
    2512          737 :                   children.safe_push (child);
    2513              :                 }
    2514              :             }
    2515              :           /* 3. build SLP nodes to combine the chain.  */
    2516          950 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2517         1416 :             if (stmts[lane] && chains[lane][0].code != code)
    2518              :               {
    2519              :                 /* See if there's any alternate all-PLUS entry.  */
    2520              :                 unsigned n;
    2521            6 :                 for (n = 1; n < chain_len; ++n)
    2522              :                   {
    2523           30 :                     for (lane = 0; lane < group_size; ++lane)
    2524           48 :                       if (stmts[lane] && chains[lane][n].code != code)
    2525              :                         break;
    2526            6 :                     if (lane == group_size)
    2527              :                       break;
    2528              :                   }
    2529            6 :                 if (n != chain_len)
    2530              :                   {
    2531              :                     /* Swap that in at first position.  */
    2532            6 :                     std::swap (children[0], children[n]);
    2533           30 :                     for (lane = 0; lane < group_size; ++lane)
    2534           24 :                       if (stmts[lane])
    2535           24 :                         std::swap (chains[lane][0], chains[lane][n]);
    2536              :                   }
    2537              :                 else
    2538              :                   {
    2539              :                     /* ???  When this triggers and we end up with two
    2540              :                        vect_constant/external_def up-front things break (ICE)
    2541              :                        spectacularly finding an insertion place for the
    2542              :                        all-constant op.  We should have a fully
    2543              :                        vect_internal_def operand though(?) so we can swap
    2544              :                        that into first place and then prepend the all-zero
    2545              :                        constant.  */
    2546            0 :                     if (dump_enabled_p ())
    2547            0 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2548              :                                        "inserting constant zero to compensate "
    2549              :                                        "for (partially) negated first "
    2550              :                                        "operand\n");
    2551            0 :                     chain_len++;
    2552            0 :                     for (lane = 0; lane < group_size; ++lane)
    2553            0 :                       if (stmts[lane])
    2554            0 :                         chains[lane].safe_insert
    2555            0 :                           (0, chain_op_t (code, vect_constant_def, NULL_TREE));
    2556            0 :                     vec<tree> zero_ops;
    2557            0 :                     zero_ops.create (group_size);
    2558            0 :                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
    2559            0 :                     for (lane = 1; lane < group_size; ++lane)
    2560            0 :                       if (stmts[lane])
    2561            0 :                         zero_ops.quick_push (zero_ops[0]);
    2562              :                       else
    2563            0 :                         zero_ops.quick_push (NULL_TREE);
    2564            0 :                     slp_tree zero = vect_create_new_slp_node (zero_ops);
    2565            0 :                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
    2566            0 :                     children.safe_insert (0, zero);
    2567              :                   }
    2568              :                 break;
    2569              :               }
    2570          809 :           for (unsigned i = 1; i < children.length (); ++i)
    2571              :             {
    2572          561 :               slp_tree op0 = children[i - 1];
    2573          561 :               slp_tree op1 = children[i];
    2574          561 :               bool this_two_op = false;
    2575         2169 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2576         3460 :                 if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
    2577              :                   {
    2578              :                     this_two_op = true;
    2579              :                     break;
    2580              :                   }
    2581          561 :               slp_tree child;
    2582          561 :               if (i == children.length () - 1)
    2583          248 :                 child = vect_create_new_slp_node (node, stmts, 2);
    2584              :               else
    2585          313 :                 child = vect_create_new_slp_node (2, ERROR_MARK);
    2586          561 :               if (this_two_op)
    2587              :                 {
    2588          122 :                   vec<std::pair<unsigned, unsigned> > lperm;
    2589          122 :                   lperm.create (group_size);
    2590          462 :                   for (unsigned lane = 0; lane < group_size; ++lane)
    2591          680 :                     lperm.quick_push (std::make_pair
    2592          340 :                       (chains[lane][i].code != chains[0][i].code, lane));
    2593          244 :                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
    2594          122 :                                                      (chains[0][i].code == code
    2595              :                                                       ? op_stmt_info
    2596              :                                                       : other_op_stmt_info),
    2597          122 :                                                      (chains[0][i].code == code
    2598              :                                                       ? other_op_stmt_info
    2599              :                                                       : op_stmt_info),
    2600              :                                                      lperm);
    2601              :                 }
    2602              :               else
    2603              :                 {
    2604          439 :                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
    2605          439 :                   SLP_TREE_VECTYPE (child) = vectype;
    2606          439 :                   SLP_TREE_LANES (child) = group_size;
    2607          439 :                   SLP_TREE_CHILDREN (child).quick_push (op0);
    2608          439 :                   SLP_TREE_CHILDREN (child).quick_push (op1);
    2609          439 :                   SLP_TREE_REPRESENTATIVE (child)
    2610          878 :                     = (chains[0][i].code == code
    2611          439 :                        ? op_stmt_info : other_op_stmt_info);
    2612              :                 }
    2613          561 :               children[i] = child;
    2614              :             }
    2615          248 :           *tree_size += this_tree_size + 1;
    2616          248 :           *max_nunits = this_max_nunits;
    2617         1244 :           while (!chains.is_empty ())
    2618          726 :             chains.pop ().release ();
    2619              :           return node;
    2620              :         }
    2621        13384 : out:
    2622        13405 :       if (dump_enabled_p ())
    2623         2775 :         dump_printf_loc (MSG_NOTE, vect_location,
    2624              :                          "failed to line up SLP graph by re-associating "
    2625              :                          "operations in lanes%s\n",
    2626              :                          !hard_fail ? " trying regular discovery" : "");
    2627        13410 :       while (!children.is_empty ())
    2628            5 :         vect_free_slp_tree (children.pop ());
    2629        13564 :       while (!chains.is_empty ())
    2630          159 :         chains.pop ().release ();
    2631              :       /* Hard-fail, otherwise we might run into quadratic processing of the
    2632              :          chains starting one stmt into the chain again.  */
    2633        13405 :       if (hard_fail)
    2634              :         return NULL;
    2635              :       /* Fall thru to normal processing.  */
    2636        13653 :     }
    2637              : 
    2638              :   /* Get at the operands, verifying they are compatible.  */
    2639      2972023 :   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
    2640      2972023 :   slp_oprnd_info oprnd_info;
    2641     15172127 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    2642              :     {
    2643     24402438 :       int res = vect_get_and_check_slp_defs (vinfo, vectype,
    2644     12201219 :                                              swap[i], skip_args,
    2645              :                                              stmts, i, &oprnds_info);
    2646     12201219 :       if (res != 0)
    2647       531005 :         matches[(res == -1) ? 0 : i] = false;
    2648     12201219 :       if (!matches[0])
    2649              :         break;
    2650              :     }
    2651     14871622 :   for (i = 0; i < group_size; ++i)
    2652     12110657 :     if (!matches[i])
    2653              :       {
    2654       211058 :         vect_free_oprnd_info (oprnds_info);
    2655       211058 :         return NULL;
    2656              :       }
    2657      8282895 :   swap = NULL;
    2658              : 
    2659      8282895 :   bool has_two_operators_perm = false;
    2660     16565790 :   auto_vec<unsigned> two_op_perm_indices[2];
    2661      2760965 :   vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
    2662              : 
    2663      2773070 :   if (two_operators && oprnds_info.length () == 2 && group_size > 2)
    2664              :     {
    2665         2723 :       unsigned idx = 0;
    2666         2723 :       hash_map<gimple *, unsigned> seen;
    2667         2723 :       vec<slp_oprnd_info> new_oprnds_info
    2668         2723 :         = vect_create_oprnd_info (1, group_size);
    2669         2723 :       bool success = true;
    2670              : 
    2671         2723 :       enum tree_code code = ERROR_MARK;
    2672         2723 :       if (oprnds_info[0]->def_stmts[0]
    2673         2723 :           && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
    2674         2665 :         code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
    2675         2723 :       basic_block bb = nullptr;
    2676              : 
    2677         5992 :       for (unsigned j = 0; j < group_size; ++j)
    2678              :         {
    2679        14323 :           FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2680              :             {
    2681        11054 :               stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
    2682        11054 :               if (!stmt_info
    2683        10843 :                   || !is_a<gassign *> (stmt_info->stmt)
    2684        10840 :                   || gimple_assign_rhs_code (stmt_info->stmt) != code
    2685        19783 :                   || skip_args[i])
    2686              :                 {
    2687              :                   success = false;
    2688         2329 :                   break;
    2689              :                 }
    2690              :               /* Avoid mixing lanes with defs in different basic-blocks.  */
    2691         8729 :               if (!bb)
    2692         2821 :                 bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
    2693         7428 :               else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
    2694              :                 {
    2695              :                   success = false;
    2696              :                   break;
    2697              :                 }
    2698              : 
    2699         8725 :               bool exists;
    2700         8725 :               unsigned &stmt_idx
    2701         8725 :                 = seen.get_or_insert (stmt_info->stmt, &exists);
    2702              : 
    2703         8725 :               if (!exists)
    2704              :                 {
    2705         7676 :                   new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
    2706         7676 :                   new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
    2707         7676 :                   stmt_idx = idx;
    2708         7676 :                   idx++;
    2709              :                 }
    2710              : 
    2711         8725 :               two_op_perm_indices[i].safe_push (stmt_idx);
    2712              :             }
    2713              : 
    2714         5598 :           if (!success)
    2715              :             break;
    2716              :         }
    2717              : 
    2718         2723 :       if (success && idx == group_size)
    2719              :         {
    2720           56 :           if (dump_enabled_p ())
    2721              :             {
    2722            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2723              :                                "Replace two_operators operands:\n");
    2724              : 
    2725            0 :               FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2726              :                 {
    2727            0 :                   dump_printf_loc (MSG_NOTE, vect_location,
    2728              :                                    "Operand %u:\n", i);
    2729            0 :                   for (unsigned j = 0; j < group_size; j++)
    2730            0 :                     dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2731            0 :                                      j, oprnd_info->def_stmts[j]->stmt);
    2732              :                 }
    2733              : 
    2734            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2735              :                                "With a single operand:\n");
    2736            0 :               for (unsigned j = 0; j < group_size; j++)
    2737            0 :                 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2738            0 :                                  j, new_oprnds_info[0]->def_stmts[j]->stmt);
    2739              :             }
    2740              : 
    2741           56 :           two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
    2742           56 :           two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
    2743              : 
    2744           56 :           new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
    2745           56 :           new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
    2746           56 :           new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
    2747           56 :           new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
    2748           56 :           new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
    2749              : 
    2750           56 :           vect_free_oprnd_info (oprnds_info);
    2751           56 :           oprnds_info = new_oprnds_info;
    2752           56 :           nops = 1;
    2753           56 :           has_two_operators_perm = true;
    2754              :         }
    2755              :       else
    2756         2667 :         vect_free_oprnd_info (new_oprnds_info);
    2757         2723 :     }
    2758              : 
    2759      5521930 :   auto_vec<slp_tree, 4> children;
    2760              : 
    2761      2760965 :   stmt_info = stmts[0];
    2762              : 
    2763      2760965 :   int reduc_idx = -1;
    2764      2760965 :   int gs_scale = 0;
    2765      2760965 :   tree gs_base = NULL_TREE;
    2766              : 
    2767              :   /* Create SLP_TREE nodes for the definition node/s.  */
    2768      7059074 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2769              :     {
    2770      4385171 :       slp_tree child = nullptr;
    2771      4385171 :       unsigned int j;
    2772              : 
    2773              :       /* We're skipping certain operands from processing, for example
    2774              :          outer loop reduction initial defs.  */
    2775      4385171 :       if (skip_args[i])
    2776              :         {
    2777       417013 :           children.safe_push (NULL);
    2778      4715122 :           continue;
    2779              :         }
    2780              : 
    2781      3968158 :       if (oprnd_info->first_dt == vect_uninitialized_def)
    2782              :         {
    2783              :           /* COND_EXPR have one too many eventually if the condition
    2784              :              is a SSA name.  */
    2785            0 :           gcc_assert (i == 3 && nops == 4);
    2786            0 :           continue;
    2787              :         }
    2788              : 
    2789      3968158 :       if (oprnd_info->first_gs_p)
    2790              :         {
    2791        21765 :           gs_scale = oprnd_info->first_gs_info.scale;
    2792        21765 :           gs_base = oprnd_info->first_gs_info.base;
    2793              :         }
    2794              : 
    2795      3968158 :       if (is_a <bb_vec_info> (vinfo)
    2796      1560561 :           && oprnd_info->first_dt == vect_internal_def
    2797      4775279 :           && !oprnd_info->any_pattern)
    2798              :         {
    2799              :           /* For BB vectorization, if all defs are the same do not
    2800              :              bother to continue the build along the single-lane
    2801              :              graph but use a splat of the scalar value.  */
    2802       764295 :           stmt_vec_info first_def = oprnd_info->def_stmts[0];
    2803       820050 :           for (j = 1; j < group_size; ++j)
    2804       779908 :             if (oprnd_info->def_stmts[j] != first_def)
    2805              :               break;
    2806       764295 :           if (j == group_size
    2807              :               /* But avoid doing this for loads where we may be
    2808              :                  able to CSE things, unless the stmt is not
    2809              :                  vectorizable.  */
    2810       764295 :               && (!STMT_VINFO_VECTORIZABLE (first_def)
    2811        49400 :                   || !gimple_vuse (first_def->stmt)))
    2812              :             {
    2813        30856 :               if (dump_enabled_p ())
    2814           93 :                 dump_printf_loc (MSG_NOTE, vect_location,
    2815              :                                  "Using a splat of the uniform operand %G",
    2816              :                                  first_def->stmt);
    2817        30856 :               oprnd_info->first_dt = vect_external_def;
    2818              :             }
    2819              :         }
    2820              : 
    2821      3968158 :       if (oprnd_info->first_dt == vect_external_def
    2822      3968158 :           || oprnd_info->first_dt == vect_constant_def)
    2823              :         {
    2824      1388155 :           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
    2825              :             {
    2826              :               tree op0;
    2827              :               tree uniform_val = op0 = oprnd_info->ops[0];
    2828              :               for (j = 1; j < oprnd_info->ops.length (); ++j)
    2829              :                 if (oprnd_info->ops[j]
    2830              :                     && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
    2831              :                   {
    2832              :                     uniform_val = NULL_TREE;
    2833              :                     break;
    2834              :                   }
    2835              :               if (!uniform_val
    2836              :                   && !can_duplicate_and_interleave_p (vinfo,
    2837              :                                                       oprnd_info->ops.length (),
    2838              :                                                       TREE_TYPE (op0)))
    2839              :                 {
    2840              :                   matches[j] = false;
    2841              :                   if (dump_enabled_p ())
    2842              :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2843              :                                      "Build SLP failed: invalid type of def "
    2844              :                                      "for variable-length SLP %T\n", op0);
    2845              :                   goto fail;
    2846              :                 }
    2847              :             }
    2848      1388155 :           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
    2849      1388155 :           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
    2850      1388155 :           oprnd_info->ops = vNULL;
    2851      1388155 :           children.safe_push (invnode);
    2852      1388155 :           continue;
    2853      1388155 :         }
    2854              : 
    2855              :       /* See which SLP operand a reduction chain continues on.  We want
    2856              :          to chain even PHIs but not backedges.  */
    2857      2580003 :       if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
    2858      2580003 :           || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
    2859              :         {
    2860       160448 :           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    2861              :             {
    2862          638 :               if (oprnd_info->first_dt == vect_double_reduction_def)
    2863          319 :                 reduc_idx = i;
    2864              :             }
    2865       159810 :           else if (is_a <gphi *> (stmt_info->stmt)
    2866       159810 :                    && gimple_phi_num_args
    2867        70190 :                         (as_a <gphi *> (stmt_info->stmt)) != 1)
    2868              :             ;
    2869        89944 :           else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2870          324 :                    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    2871              :             ;
    2872        89944 :           else if (reduc_idx == -1)
    2873        85628 :             reduc_idx = i;
    2874              :           else
    2875              :             /* For .COND_* reduction operations the else value can be the
    2876              :                same as one of the operation operands.  The other def
    2877              :                stmts have been moved, so we can't check easily.  Check
    2878              :                it's a call at least.  */
    2879         4316 :             gcc_assert (is_a <gcall *> (stmt_info->stmt));
    2880              :         }
    2881              : 
    2882              :       /* When we have a masked load with uniform mask discover this
    2883              :          as a single-lane mask with a splat permute.  This way we can
    2884              :          recognize this as a masked load-lane by stripping the splat.  */
    2885      2580003 :       if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    2886        34757 :           && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    2887              :                                      IFN_MASK_LOAD)
    2888         4737 :           && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2889      2580080 :           && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
    2890              :         {
    2891           35 :           vec<stmt_vec_info> def_stmts2;
    2892           35 :           def_stmts2.create (1);
    2893           35 :           def_stmts2.quick_push (oprnd_info->def_stmts[0]);
    2894           35 :           child = vect_build_slp_tree (vinfo, def_stmts2, 1,
    2895              :                                        &this_max_nunits,
    2896              :                                        matches, limit,
    2897              :                                        &this_tree_size, bst_map);
    2898           35 :           if (child)
    2899              :             {
    2900           35 :               slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    2901           35 :               SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
    2902           35 :               SLP_TREE_LANES (pnode) = group_size;
    2903           35 :               SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
    2904           35 :               SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
    2905          210 :               for (unsigned k = 0; k < group_size; ++k)
    2906              :                 {
    2907          175 :                   SLP_TREE_SCALAR_STMTS (pnode)
    2908          175 :                     .quick_push (oprnd_info->def_stmts[0]);
    2909          175 :                   SLP_TREE_LANE_PERMUTATION (pnode)
    2910          175 :                     .quick_push (std::make_pair (0u, 0u));
    2911              :                 }
    2912           35 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    2913           35 :               pnode->max_nunits = child->max_nunits;
    2914           35 :               children.safe_push (pnode);
    2915           35 :               oprnd_info->def_stmts = vNULL;
    2916           35 :               continue;
    2917           35 :             }
    2918              :           else
    2919            0 :             def_stmts2.release ();
    2920              :         }
    2921              : 
    2922      2579968 :       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    2923              :                                         group_size, &this_max_nunits,
    2924              :                                         matches, limit,
    2925              :                                         &this_tree_size, bst_map)) != NULL)
    2926              :         {
    2927      2128815 :           oprnd_info->def_stmts = vNULL;
    2928      2128815 :           children.safe_push (child);
    2929      2128815 :           continue;
    2930              :         }
    2931              : 
    2932              :       /* If the SLP build for operand zero failed and operand zero
    2933              :          and one can be commutated try that for the scalar stmts
    2934              :          that failed the match.  */
    2935       451153 :       if (i == 0
    2936              :           /* A first scalar stmt mismatch signals a fatal mismatch.  */
    2937       354897 :           && matches[0]
    2938              :           /* ???  For COND_EXPRs we can swap the comparison operands
    2939              :              as well as the arms under some constraints.  */
    2940       168383 :           && (nops == 2 || nops == 3)
    2941       101205 :           && oprnds_info[1]->first_dt == vect_internal_def
    2942        55229 :           && (is_gimple_assign (stmt_info->stmt)
    2943        11374 :               || is_gimple_call (stmt_info->stmt))
    2944              :           /* Swapping operands for reductions breaks assumptions later on.  */
    2945       495021 :           && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    2946              :         {
    2947              :           /* See whether we can swap the matching or the non-matching
    2948              :              stmt operands.  */
    2949              :           bool swap_not_matching = true;
    2950        49309 :           do
    2951              :             {
    2952      7033912 :               for (j = 0; j < group_size; ++j)
    2953              :                 {
    2954      6998376 :                   if (matches[j] != !swap_not_matching)
    2955        64099 :                     continue;
    2956      6934277 :                   stmt_vec_info stmt_info = stmts[j];
    2957              :                   /* Verify if we can swap operands of this stmt.  */
    2958      6934277 :                   if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    2959              :                     {
    2960      6934251 :                       tree_code code = gimple_assign_rhs_code (stmt);
    2961      6934251 :                       if (! commutative_tree_code (code)
    2962      6934251 :                           && ! commutative_ternary_tree_code (code))
    2963              :                         {
    2964        13749 :                           if (!swap_not_matching)
    2965         6339 :                             goto fail;
    2966              :                           swap_not_matching = false;
    2967              :                           break;
    2968              :                         }
    2969              :                     }
    2970      6984629 :                   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    2971              :                     {
    2972           26 :                       internal_fn fn = (gimple_call_internal_p (call)
    2973           26 :                                         ? gimple_call_internal_fn (call)
    2974              :                                         : IFN_LAST);
    2975           26 :                       if ((! commutative_binary_fn_p (fn)
    2976           26 :                            && ! commutative_ternary_fn_p (fn))
    2977           28 :                           || first_commutative_argument (fn) != 0)
    2978              :                         {
    2979           24 :                           if (!swap_not_matching)
    2980           12 :                             goto fail;
    2981              :                           swap_not_matching = false;
    2982              :                           break;
    2983              :                         }
    2984              :                     }
    2985              :                 }
    2986              :             }
    2987        42958 :           while (j != group_size);
    2988              : 
    2989              :           /* Swap mismatched definition stmts.  */
    2990        35536 :           if (dump_enabled_p ())
    2991          345 :             dump_printf_loc (MSG_NOTE, vect_location,
    2992              :                              "Re-trying with swapped operands of stmts ");
    2993      7011912 :           for (j = 0; j < group_size; ++j)
    2994      6976376 :             if (matches[j] == !swap_not_matching)
    2995              :               {
    2996     13840696 :                 std::swap (oprnds_info[0]->def_stmts[j],
    2997      6920348 :                            oprnds_info[1]->def_stmts[j]);
    2998     13840696 :                 std::swap (oprnds_info[0]->ops[j],
    2999      6920348 :                            oprnds_info[1]->ops[j]);
    3000      6920348 :                 if (dump_enabled_p ())
    3001          938 :                   dump_printf (MSG_NOTE, "%d ", j);
    3002              :               }
    3003        35536 :           if (dump_enabled_p ())
    3004          345 :             dump_printf (MSG_NOTE, "\n");
    3005              :           /* After swapping some operands we lost track whether an
    3006              :              operand has any pattern defs so be conservative here.  */
    3007        67843 :           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
    3008         3272 :             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
    3009              :           /* And try again with scratch 'matches' ... */
    3010        35536 :           bool *tem = XALLOCAVEC (bool, group_size);
    3011        35536 :           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    3012              :                                             group_size, &this_max_nunits,
    3013              :                                             tem, limit,
    3014              :                                             &this_tree_size, bst_map)) != NULL)
    3015              :             {
    3016         5623 :               oprnd_info->def_stmts = vNULL;
    3017         5623 :               children.safe_push (child);
    3018         5623 :               continue;
    3019              :             }
    3020              :         }
    3021       445530 : fail:
    3022              : 
    3023              :       /* If the SLP build failed and we analyze a basic-block
    3024              :          simply treat nodes we fail to build as externally defined
    3025              :          (and thus build vectors from the scalar defs).
    3026              :          The cost model will reject outright expensive cases.
    3027              :          ???  This doesn't treat cases where permutation ultimately
    3028              :          fails (or we don't try permutation below).  Ideally we'd
    3029              :          even compute a permutation that will end up with the maximum
    3030              :          SLP tree size...  */
    3031       445530 :       if (is_a <bb_vec_info> (vinfo)
    3032              :           /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3033              :              do extra work to cancel the pattern so the uses see the
    3034              :              scalar version.  */
    3035       393030 :           && !is_pattern_stmt_p (stmt_info)
    3036       814557 :           && !oprnd_info->any_pattern)
    3037              :         {
    3038              :           /* But if there's a leading vector sized set of matching stmts
    3039              :              fail here so we can split the group.  This matches the condition
    3040              :              vect_analyze_slp_instance uses.  */
    3041              :           /* ???  We might want to split here and combine the results to support
    3042              :              multiple vector sizes better.  */
    3043       578979 :           for (j = 0; j < group_size; ++j)
    3044       578979 :             if (!matches[j])
    3045              :               break;
    3046       368766 :           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
    3047       368737 :               && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
    3048              :             {
    3049       358468 :               if (dump_enabled_p ())
    3050          501 :                 dump_printf_loc (MSG_NOTE, vect_location,
    3051              :                                  "Building vector operands from scalars\n");
    3052       358468 :               this_tree_size++;
    3053       358468 :               child = vect_create_new_slp_node (oprnd_info->ops);
    3054       358468 :               children.safe_push (child);
    3055       358468 :               oprnd_info->ops = vNULL;
    3056       358468 :               continue;
    3057              :             }
    3058              :         }
    3059              : 
    3060        87062 :       gcc_assert (child == NULL);
    3061        97955 :       FOR_EACH_VEC_ELT (children, j, child)
    3062        10893 :         if (child)
    3063        10893 :           vect_free_slp_tree (child);
    3064        87062 :       vect_free_oprnd_info (oprnds_info);
    3065        87062 :       return NULL;
    3066              :     }
    3067              : 
    3068      2673903 :   vect_free_oprnd_info (oprnds_info);
    3069              : 
    3070              :   /* If we have all children of a child built up from uniform scalars
    3071              :      or it does more than one possibly expensive vector construction then
    3072              :      just throw that away, causing it to be built up from scalars.
    3073              :      The exception is the SLP node for the vector store.  */
    3074      2673903 :   if (is_a <bb_vec_info> (vinfo)
    3075      1087131 :       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
    3076              :       /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3077              :          do extra work to cancel the pattern so the uses see the
    3078              :          scalar version.  */
    3079      3106291 :       && !is_pattern_stmt_p (stmt_info))
    3080              :     {
    3081              :       slp_tree child;
    3082              :       unsigned j;
    3083              :       bool all_uniform_p = true;
    3084              :       unsigned n_vector_builds = 0;
    3085      1229078 :       FOR_EACH_VEC_ELT (children, j, child)
    3086              :         {
    3087       821750 :           if (!child)
    3088              :             ;
    3089       821750 :           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    3090              :             all_uniform_p = false;
    3091       586721 :           else if (!vect_slp_tree_uniform_p (child))
    3092              :             {
    3093       446959 :               all_uniform_p = false;
    3094       446959 :               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
    3095       412845 :                 n_vector_builds++;
    3096              :             }
    3097              :         }
    3098       407328 :       if (all_uniform_p
    3099       407328 :           || n_vector_builds > 1
    3100       691365 :           || (n_vector_builds == children.length ()
    3101        30174 :               && is_a <gphi *> (stmt_info->stmt)))
    3102              :         {
    3103              :           /* Roll back.  */
    3104       128090 :           matches[0] = false;
    3105       407072 :           FOR_EACH_VEC_ELT (children, j, child)
    3106       278982 :             if (child)
    3107       278982 :               vect_free_slp_tree (child);
    3108              : 
    3109       128090 :           if (dump_enabled_p ())
    3110          129 :             dump_printf_loc (MSG_NOTE, vect_location,
    3111              :                              "Building parent vector operands from "
    3112              :                              "scalars instead\n");
    3113       128090 :           return NULL;
    3114              :         }
    3115              :     }
    3116              : 
    3117      2545813 :   *tree_size += this_tree_size + 1;
    3118      2545813 :   *max_nunits = this_max_nunits;
    3119              : 
    3120      2545813 :   if (two_operators)
    3121              :     {
    3122              :       /* ???  We'd likely want to either cache in bst_map sth like
    3123              :          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
    3124              :          the true { a+b, a+b, a+b, a+b } ... but there we don't have
    3125              :          explicit stmts to put in so the keying on 'stmts' doesn't
    3126              :          work (but we have the same issue with nodes that use 'ops').  */
    3127              : 
    3128         5908 :       if (has_two_operators_perm)
    3129              :         {
    3130           22 :           slp_tree child = children[0];
    3131           22 :           children.truncate (0);
    3132           66 :           for (i = 0; i < 2; i++)
    3133              :             {
    3134           44 :               slp_tree pnode
    3135           44 :                 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
    3136           44 :               SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
    3137           44 :               SLP_TREE_VECTYPE (pnode) = vectype;
    3138           44 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3139           44 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3140           44 :               lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
    3141           44 :               children.safe_push (pnode);
    3142              : 
    3143          476 :               for (unsigned j = 0; j < stmts.length (); j++)
    3144          432 :                 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
    3145              :             }
    3146              : 
    3147           22 :           SLP_TREE_REF_COUNT (child) += 4;
    3148              :         }
    3149              : 
    3150         5908 :       slp_tree one = new _slp_tree;
    3151         5908 :       slp_tree two = new _slp_tree;
    3152         5908 :       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
    3153         5908 :       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
    3154         5908 :       SLP_TREE_VECTYPE (one) = vectype;
    3155         5908 :       SLP_TREE_VECTYPE (two) = vectype;
    3156         5908 :       SLP_TREE_CHILDREN (one).safe_splice (children);
    3157         5908 :       SLP_TREE_CHILDREN (two).safe_splice (children);
    3158         5908 :       slp_tree child;
    3159        23634 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
    3160        11818 :         SLP_TREE_REF_COUNT (child)++;
    3161              : 
    3162              :       /* Here we record the original defs since this
    3163              :          node represents the final lane configuration.  */
    3164         5908 :       node = vect_create_new_slp_node (node, stmts, 2);
    3165         5908 :       SLP_TREE_VECTYPE (node) = vectype;
    3166         5908 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    3167         5908 :       SLP_TREE_CHILDREN (node).quick_push (one);
    3168         5908 :       SLP_TREE_CHILDREN (node).quick_push (two);
    3169         5908 :       enum tree_code code0 = ERROR_MARK;
    3170         5908 :       enum tree_code ocode = ERROR_MARK;
    3171         5908 :       if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
    3172         5906 :         code0 = gimple_assign_rhs_code (stmt);
    3173         5908 :       stmt_vec_info ostmt_info;
    3174         5908 :       unsigned j = 0;
    3175        22009 :       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
    3176              :         {
    3177        16101 :           int op = 0;
    3178        16101 :           if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
    3179              :             {
    3180        16097 :               if (gimple_assign_rhs_code (ostmt) != code0)
    3181              :                 {
    3182         8083 :                   ocode = gimple_assign_rhs_code (ostmt);
    3183              :                   op = 1;
    3184              :                   j = i;
    3185              :                 }
    3186              :             }
    3187              :           else
    3188              :             {
    3189            8 :               if (gimple_call_combined_fn (stmts[0]->stmt)
    3190            4 :                   != gimple_call_combined_fn (ostmt_info->stmt))
    3191              :                 {
    3192            2 :                   op = 1;
    3193            2 :                   j = i;
    3194              :                 }
    3195              :             }
    3196        16101 :           SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
    3197              :         }
    3198         5908 :       SLP_TREE_CODE (one) = code0;
    3199         5908 :       SLP_TREE_CODE (two) = ocode;
    3200         5908 :       SLP_TREE_LANES (one) = stmts.length ();
    3201         5908 :       SLP_TREE_LANES (two) = stmts.length ();
    3202         5908 :       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
    3203         5908 :       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
    3204              : 
    3205         5908 :       return node;
    3206              :     }
    3207              : 
    3208      2539905 :   node = vect_create_new_slp_node (node, stmts, nops);
    3209      2539905 :   SLP_TREE_VECTYPE (node) = vectype;
    3210      2539905 :   SLP_TREE_CHILDREN (node).splice (children);
    3211      2539905 :   SLP_TREE_GS_SCALE (node) = gs_scale;
    3212      2539905 :   SLP_TREE_GS_BASE (node) = gs_base;
    3213      2539905 :   if (reduc_idx != -1)
    3214              :     {
    3215        80797 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
    3216              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
    3217              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
    3218        80797 :       SLP_TREE_REDUC_IDX (node) = reduc_idx;
    3219        80797 :       node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
    3220              :     }
    3221              :   /* When reaching the reduction PHI, create a vect_reduc_info.  */
    3222      2459108 :   else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
    3223      2459108 :             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3224      2459108 :            && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
    3225              :     {
    3226        71280 :       loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
    3227        71280 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
    3228        71280 :       node->cycle_info.id = loop_vinfo->reduc_infos.length ();
    3229        71280 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    3230        71280 :       loop_vinfo->reduc_infos.safe_push (reduc_info);
    3231        71280 :       stmt_vec_info reduc_phi = stmt_info;
    3232              :       /* ???  For double reductions vect_is_simple_reduction stores the
    3233              :          reduction type and code on the inner loop header PHI.  */
    3234        71280 :       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3235              :         {
    3236          319 :           use_operand_p use_p;
    3237          319 :           gimple *use_stmt;
    3238          319 :           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
    3239              :                                      &use_p, &use_stmt);
    3240          319 :           gcc_assert (res);
    3241          319 :           reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
    3242              :         }
    3243        71280 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
    3244        71280 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
    3245        71280 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
    3246        71280 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    3247              :     }
    3248              :   return node;
    3249      8282895 : }
    3250              : 
    3251              : /* Dump a single SLP tree NODE.  */
    3252              : 
    3253              : static void
    3254       437743 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
    3255              :                      slp_tree node)
    3256              : {
    3257       437743 :   unsigned i, j;
    3258       437743 :   slp_tree child;
    3259       437743 :   stmt_vec_info stmt_info;
    3260       437743 :   tree op;
    3261              : 
    3262       437743 :   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
    3263       437743 :   dump_user_location_t user_loc = loc.get_user_location ();
    3264       437743 :   dump_printf_loc (metadata, user_loc,
    3265              :                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
    3266              :                    ", refcnt=%u)",
    3267       437743 :                    SLP_TREE_DEF_TYPE (node) == vect_external_def
    3268              :                    ? " (external)"
    3269              :                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    3270       422504 :                       ? " (constant)"
    3271              :                       : ""), (void *) node,
    3272       437743 :                    estimated_poly_value (node->max_nunits),
    3273              :                                          SLP_TREE_REF_COUNT (node));
    3274       437743 :   if (SLP_TREE_VECTYPE (node))
    3275       371419 :     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
    3276       437743 :   dump_printf (metadata, "%s",
    3277       437743 :                node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
    3278       437743 :   if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
    3279        23089 :     dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
    3280              :                  node->cycle_info.reduc_idx);
    3281       437743 :   dump_printf (metadata, "\n");
    3282       437743 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3283              :     {
    3284       356053 :       if (SLP_TREE_PERMUTE_P (node))
    3285        13548 :         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
    3286              :       else
    3287       342505 :         dump_printf_loc (metadata, user_loc, "op template: %G",
    3288       342505 :                          SLP_TREE_REPRESENTATIVE (node)->stmt);
    3289              :     }
    3290       437743 :   if (SLP_TREE_SCALAR_STMTS (node).exists ())
    3291       853136 :     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3292       505124 :       if (stmt_info)
    3293       499843 :         dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
    3294       499843 :                          STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
    3295              :                          i, stmt_info->stmt);
    3296              :       else
    3297         5281 :         dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
    3298              :   else
    3299              :     {
    3300        89731 :       dump_printf_loc (metadata, user_loc, "\t{ ");
    3301       287370 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    3302       107908 :         dump_printf (metadata, "%T%s ", op,
    3303       107908 :                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
    3304        89731 :       dump_printf (metadata, "}\n");
    3305              :     }
    3306       437743 :   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    3307              :     {
    3308        62198 :       dump_printf_loc (metadata, user_loc, "\tload permutation {");
    3309       204260 :       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
    3310        79864 :         dump_printf (dump_kind, " %u", j);
    3311        62198 :       dump_printf (dump_kind, " }\n");
    3312              :     }
    3313       437743 :   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    3314              :     {
    3315        13556 :       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
    3316        64464 :       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
    3317        37352 :         dump_printf (dump_kind, " %u[%u]",
    3318        37352 :                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
    3319        37352 :                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
    3320        13556 :       dump_printf (dump_kind, " }%s\n",
    3321        13556 :                    node->ldst_lanes ? " (load-lanes)" : "");
    3322              :     }
    3323       437743 :   if (SLP_TREE_CHILDREN (node).is_empty ())
    3324       166051 :     return;
    3325       271692 :   dump_printf_loc (metadata, user_loc, "\tchildren");
    3326       988783 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3327       445399 :     dump_printf (dump_kind, " %p", (void *)child);
    3328       271692 :   dump_printf (dump_kind, "%s\n",
    3329       271692 :                node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
    3330              :                ? " (store-lanes)" : "");
    3331              : }
    3332              : 
    3333              : DEBUG_FUNCTION void
    3334            0 : debug (slp_tree node)
    3335              : {
    3336            0 :   debug_dump_context ctx;
    3337            0 :   vect_print_slp_tree (MSG_NOTE,
    3338            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3339              :                        node);
    3340            0 : }
    3341              : 
    3342              : /* Recursive helper for the dot producer below.  */
    3343              : 
    3344              : static void
    3345            0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
    3346              : {
    3347            0 :   if (visited.add (node))
    3348              :     return;
    3349              : 
    3350            0 :   fprintf (f, "\"%p\" [label=\"", (void *)node);
    3351            0 :   vect_print_slp_tree (MSG_NOTE,
    3352            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3353              :                        node);
    3354            0 :   fprintf (f, "\"];\n");
    3355              : 
    3356              : 
    3357            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3358            0 :     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
    3359              : 
    3360            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3361            0 :     if (child)
    3362            0 :       dot_slp_tree (f, child, visited);
    3363              : }
    3364              : 
    3365              : DEBUG_FUNCTION void
    3366            0 : dot_slp_tree (const char *fname, slp_tree node)
    3367              : {
    3368            0 :   FILE *f = fopen (fname, "w");
    3369            0 :   fprintf (f, "digraph {\n");
    3370            0 :   fflush (f);
    3371            0 :     {
    3372            0 :       debug_dump_context ctx (f);
    3373            0 :       hash_set<slp_tree> visited;
    3374            0 :       dot_slp_tree (f, node, visited);
    3375            0 :     }
    3376            0 :   fflush (f);
    3377            0 :   fprintf (f, "}\n");
    3378            0 :   fclose (f);
    3379            0 : }
    3380              : 
    3381              : DEBUG_FUNCTION void
    3382            0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
    3383              : {
    3384            0 :   FILE *f = fopen (fname, "w");
    3385            0 :   fprintf (f, "digraph {\n");
    3386            0 :   fflush (f);
    3387            0 :     {
    3388            0 :       debug_dump_context ctx (f);
    3389            0 :       hash_set<slp_tree> visited;
    3390            0 :       for (auto inst : slp_instances)
    3391            0 :         dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
    3392            0 :     }
    3393            0 :   fflush (f);
    3394            0 :   fprintf (f, "}\n");
    3395            0 :   fclose (f);
    3396            0 : }
    3397              : 
    3398              : /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND.  */
    3399              : 
    3400              : static void
    3401       476877 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3402              :                       slp_tree node, hash_set<slp_tree> &visited)
    3403              : {
    3404       476877 :   unsigned i;
    3405       476877 :   slp_tree child;
    3406              : 
    3407       476877 :   if (visited.add (node))
    3408       476877 :     return;
    3409              : 
    3410       437293 :   vect_print_slp_tree (dump_kind, loc, node);
    3411              : 
    3412      1319471 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3413       444885 :     if (child)
    3414       403026 :       vect_print_slp_graph (dump_kind, loc, child, visited);
    3415              : }
    3416              : 
    3417              : static void
    3418        45559 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3419              :                       slp_tree entry)
    3420              : {
    3421        45559 :   hash_set<slp_tree> visited;
    3422        45559 :   vect_print_slp_graph (dump_kind, loc, entry, visited);
    3423        45559 : }
    3424              : 
    3425              : DEBUG_FUNCTION void
    3426            0 : debug (slp_instance instance)
    3427              : {
    3428            0 :   debug_dump_context ctx;
    3429            0 :   vect_print_slp_graph (MSG_NOTE,
    3430            0 :                         dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3431              :                         SLP_INSTANCE_TREE (instance));
    3432            0 : }
    3433              : 
    3434              : /* Mark the tree rooted at NODE with PURE_SLP.  */
    3435              : 
    3436              : static void
    3437      5581674 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
    3438              :                      hash_set<slp_tree> &visited)
    3439              : {
    3440      5581674 :   int i;
    3441      5581674 :   stmt_vec_info stmt_info;
    3442      5581674 :   slp_tree child;
    3443              : 
    3444      5581674 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3445              :     return;
    3446              : 
    3447      3998563 :   if (visited.add (node))
    3448              :     return;
    3449              : 
    3450      9510244 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3451      5786021 :     if (stmt_info)
    3452              :       {
    3453      5724568 :         STMT_SLP_TYPE (stmt_info) = pure_slp;
    3454              :         /* ???  For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
    3455              :            when there is the mask_conversion pattern applied we have lost the
    3456              :            alternate lanes of the uniform mask which nevertheless
    3457              :            have separate pattern defs.  To not confuse hybrid
    3458              :            analysis we mark those as covered as well here.  */
    3459      5724568 :         if (node->ldst_lanes)
    3460      5786021 :           if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    3461            0 :             if (gimple_call_internal_p (call, IFN_MASK_LOAD)
    3462            0 :                 || gimple_call_internal_p (call, IFN_MASK_STORE))
    3463              :               {
    3464            0 :                 tree mask = gimple_call_arg (call,
    3465              :                                              internal_fn_mask_index
    3466            0 :                                              (gimple_call_internal_fn (call)));
    3467            0 :                 if (TREE_CODE (mask) == SSA_NAME)
    3468            0 :                   if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
    3469              :                     {
    3470            0 :                       mask_info = vect_stmt_to_vectorize (mask_info);
    3471            0 :                       STMT_SLP_TYPE (mask_info) = pure_slp;
    3472              :                     }
    3473              :               }
    3474              :       }
    3475              : 
    3476      8383762 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3477      4659539 :     if (child)
    3478      4144642 :       vect_mark_slp_stmts (vinfo, child, visited);
    3479              : }
    3480              : 
    3481              : static void
    3482      1437032 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
    3483              : {
    3484      1437032 :   hash_set<slp_tree> visited;
    3485      1437032 :   vect_mark_slp_stmts (vinfo, node, visited);
    3486      1437032 : }
    3487              : 
    3488              : /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
    3489              : 
    3490              : static void
    3491      2319255 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
    3492              : {
    3493      2319255 :   int i;
    3494      2319255 :   stmt_vec_info stmt_info;
    3495      2319255 :   slp_tree child;
    3496              : 
    3497      2319255 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3498              :     return;
    3499              : 
    3500      1362522 :   if (visited.add (node))
    3501              :     return;
    3502              : 
    3503      4239704 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3504      2983340 :     if (stmt_info)
    3505              :       {
    3506      2983340 :         gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
    3507              :                     || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
    3508      2983340 :         STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    3509              :       }
    3510              : 
    3511      2802391 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3512      1546027 :     if (child)
    3513      1546027 :       vect_mark_slp_stmts_relevant (child, visited);
    3514              : }
    3515              : 
    3516              : static void
    3517       773228 : vect_mark_slp_stmts_relevant (slp_tree node)
    3518              : {
    3519       773228 :   hash_set<slp_tree> visited;
    3520       773228 :   vect_mark_slp_stmts_relevant (node, visited);
    3521       773228 : }
    3522              : 
    3523              : 
    3524              : /* Gather loads in the SLP graph NODE and populate the LOADS array.  */
    3525              : 
    3526              : static void
    3527      9191610 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
    3528              :                        hash_set<slp_tree> &visited)
    3529              : {
    3530      9191610 :   if (!node || visited.add (node))
    3531      1408610 :     return;
    3532              : 
    3533      7783000 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3534              :     return;
    3535              : 
    3536      5671303 :   if (!SLP_TREE_PERMUTE_P (node))
    3537              :     {
    3538      5494249 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    3539      5494249 :       if (STMT_VINFO_DATA_REF (stmt_info)
    3540      2442863 :           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    3541      1356725 :         loads.safe_push (node);
    3542              :     }
    3543              : 
    3544              :   unsigned i;
    3545              :   slp_tree child;
    3546     12820823 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3547      7149520 :     vect_gather_slp_loads (loads, child, visited);
    3548              : }
    3549              : 
    3550              : 
    3551              : /* Find the last scalar stmt in NODE.  */
    3552              : 
    3553              : stmt_vec_info
    3554      2706035 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
    3555              : {
    3556      2706035 :   stmt_vec_info last = NULL;
    3557      2706035 :   stmt_vec_info stmt_vinfo;
    3558              : 
    3559      9853121 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3560      7147086 :     if (stmt_vinfo)
    3561              :       {
    3562      7147086 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3563      7147086 :         last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    3564              :       }
    3565              : 
    3566      2706035 :   return last;
    3567              : }
    3568              : 
    3569              : /* Find the first stmt in NODE.  */
    3570              : 
    3571              : stmt_vec_info
    3572       520860 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
    3573              : {
    3574       520860 :   stmt_vec_info first = NULL;
    3575       520860 :   stmt_vec_info stmt_vinfo;
    3576              : 
    3577      1754885 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3578      1234025 :     if (stmt_vinfo)
    3579              :       {
    3580      1231331 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3581      1231331 :         if (!first
    3582      1231331 :             || get_later_stmt (stmt_vinfo, first) == first)
    3583              :           first = stmt_vinfo;
    3584              :       }
    3585              : 
    3586       520860 :   return first;
    3587              : }
    3588              : 
    3589              : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
    3590              :    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
    3591              :    (also containing the first GROUP1_SIZE stmts, since stores are
    3592              :    consecutive), the second containing the remainder.
    3593              :    Return the first stmt in the second group.  */
    3594              : 
    3595              : static stmt_vec_info
    3596       156486 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
    3597              : {
    3598       156486 :   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
    3599       156486 :   gcc_assert (group1_size > 0);
    3600       156486 :   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
    3601       156486 :   gcc_assert (group2_size > 0);
    3602       156486 :   DR_GROUP_SIZE (first_vinfo) = group1_size;
    3603              : 
    3604       156486 :   stmt_vec_info stmt_info = first_vinfo;
    3605       523237 :   for (unsigned i = group1_size; i > 1; i--)
    3606              :     {
    3607       366751 :       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3608       366751 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3609              :     }
    3610              :   /* STMT_INFO is now the last element of the first group.  */
    3611       156486 :   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3612       156486 :   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
    3613              : 
    3614       156486 :   DR_GROUP_SIZE (group2) = group2_size;
    3615       437230 :   for (stmt_info = group2; stmt_info;
    3616       280744 :        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    3617              :     {
    3618       280744 :       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
    3619       280744 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3620              :     }
    3621              : 
    3622              :   /* For the second group, the DR_GROUP_GAP is that before the original group,
    3623              :      plus skipping over the first vector.  */
    3624       156486 :   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
    3625              : 
    3626              :   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
    3627       156486 :   DR_GROUP_GAP (first_vinfo) += group2_size;
    3628              : 
    3629       156486 :   if (dump_enabled_p ())
    3630           61 :     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
    3631              :                      group1_size, group2_size);
    3632              : 
    3633       156486 :   return group2;
    3634              : }
    3635              : 
    3636              : /* Return the unrolling factor for an SLP instance with GROUP_SIZE statements
    3637              :    and a vector of NUNITS elements: common_multiple (NUNITS, GROUP_SIZE) / GROUP_SIZE.  */
    3638              : 
    3639              : static poly_uint64
    3640      3666989 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
    3641              : {
    3642      3666989 :   return exact_div (common_multiple (nunits, group_size), group_size);
    3643              : }
    3644              : 
    3645              : /* Return true if ROOT is a load node: not a permute node, an internal def, and its representative is a grouped access whose data reference is a read.  */
    3646              : 
    3647              : static inline bool
    3648           54 : vect_is_slp_load_node  (slp_tree root)
    3649              : {
    3650           54 :   return (!SLP_TREE_PERMUTE_P (root)
    3651           54 :           && SLP_TREE_DEF_TYPE (root) == vect_internal_def
    3652           48 :           && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
    3653           94 :           && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
    3654              : }
    3655              : 
    3656              : 
    3657              : /* Recursive worker of optimize_load_redistribution.  Returns the load node
    3658              :    that should replace ROOT, or NULL; results are cached in LOAD_MAP.  */
    3659              : 
    3660              : static slp_tree
    3661        20132 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
    3662              :                                 vec_info *vinfo, unsigned int group_size,
    3663              :                                 hash_map<slp_tree, slp_tree> *load_map,
    3664              :                                 slp_tree root)
    3665              : {
    3666        20132 :   if (slp_tree *leader = load_map->get (root))
    3667         3576 :     return *leader;
    3668              : 
    3669        16556 :   slp_tree node;
    3670        16556 :   unsigned i;
    3671              : 
    3672              :   /* For now, we don't know anything about externals so do not do anything.  */
    3673        16556 :   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    3674              :     return NULL;
    3675        12002 :   else if (SLP_TREE_PERMUTE_P (root))
    3676              :     {
    3677              :       /* First convert this node into a load node and add it to the leaves
    3678              :          list and flatten the permute from a lane to a load one.  If it's
    3679              :          unneeded it will be elided later.  */
    3680           34 :       vec<stmt_vec_info> stmts;
    3681           34 :       stmts.create (SLP_TREE_LANES (root));
    3682           34 :       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
    3683           74 :       for (unsigned j = 0; j < lane_perm.length (); j++)
    3684              :         {
    3685           54 :           std::pair<unsigned, unsigned> perm = lane_perm[j];
    3686           54 :           node = SLP_TREE_CHILDREN (root)[perm.first];
    3687              :           /* Give up unless the lane comes from a load node without children.  */
    3688           54 :           if (!vect_is_slp_load_node (node)
    3689           54 :               || SLP_TREE_CHILDREN (node).exists ())
    3690              :             {
    3691           14 :               stmts.release ();
    3692           14 :               goto next;
    3693              :             }
    3694              : 
    3695           40 :           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
    3696              :         }
    3697              : 
    3698           20 :       if (dump_enabled_p ())
    3699            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    3700              :                          "converting stmts on permute node %p\n",
    3701              :                          (void *) root);
    3702              : 
    3703           20 :       bool *matches = XALLOCAVEC (bool, group_size);
    3704           20 :       poly_uint64 max_nunits = 1;
    3705           20 :       unsigned tree_size = 0, limit = 1;
    3706           20 :       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
    3707              :                                   matches, &limit, &tree_size, bst_map);
    3708           20 :       if (!node)
    3709            0 :         stmts.release ();
    3710              :       /* Cache the result for ROOT; it is NULL if the build failed.  */
    3711           20 :       load_map->put (root, node);
    3712           20 :       return node;
    3713              :     }
    3714              : 
    3715        11968 : next:
    3716        11982 :   load_map->put (root, NULL);
    3717              :   /* ROOT itself is kept; recurse and replace children that get a new node.  */
    3718        28363 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3719              :     {
    3720        16381 :       slp_tree value
    3721        16381 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3722              :                                           node);
    3723        16381 :       if (value)
    3724              :         {
    3725           20 :           SLP_TREE_REF_COUNT (value)++;
    3726           20 :           SLP_TREE_CHILDREN (root)[i] = value;
    3727              :           /* ???  We know the original leaves of the replaced nodes will
    3728              :              be referenced by bst_map, only the permutes created by
    3729              :              pattern matching are not.  */
    3730           20 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3731           20 :             load_map->remove (node);
    3732           20 :           vect_free_slp_tree (node);
    3733              :         }
    3734              :     }
    3735              : 
    3736              :   return NULL;
    3737              : }
    3738              : 
    3739              : /* Temporary workaround for loads not being CSEd during SLP build.  This
    3740              :    function will traverse the SLP tree rooted in ROOT and find VEC_PERM
    3741              :    nodes that blend vectors from multiple nodes that all read from the
    3742              :    same DR such that the final operation is equal to a permuted load.  Such
    3743              :    NODES are then directly converted into LOADS themselves.  The nodes are
    3744              :    CSEd using BST_MAP.  LOAD_MAP is shared with the recursive worker.  */
    3745              : 
    3746              : static void
    3747         2835 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
    3748              :                               vec_info *vinfo, unsigned int group_size,
    3749              :                               hash_map<slp_tree, slp_tree> *load_map,
    3750              :                               slp_tree root)
    3751              : {
    3752         2835 :   slp_tree node;
    3753         2835 :   unsigned i;
    3754              :   /* Only the children of ROOT are candidates for replacement here.  */
    3755         6586 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3756              :     {
    3757         3751 :       slp_tree value
    3758         3751 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3759              :                                           node);
    3760         3751 :       if (value)
    3761              :         {
    3762            0 :           SLP_TREE_REF_COUNT (value)++;
    3763            0 :           SLP_TREE_CHILDREN (root)[i] = value;
    3764              :           /* ???  We know the original leaves of the replaced nodes will
    3765              :              be referenced by bst_map, only the permutes created by
    3766              :              pattern matching are not.  */
    3767            0 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3768            0 :             load_map->remove (node);
    3769            0 :           vect_free_slp_tree (node);
    3770              :         }
    3771              :     }
    3772         2835 : }
    3773              : 
    3774              : /* Helper function of vect_match_slp_patterns.
    3775              : 
    3776              :    Attempts to match patterns against the slp tree rooted in REF_NODE using
    3777              :    VINFO.  Patterns are matched in post-order traversal.
    3778              : 
    3779              :    If matching is successful the value in REF_NODE is updated.  Returns true
    3780              :    if any pattern matched in the tree; nodes already in VISITED are skipped.  */
    3781              : 
    3782              : static bool
    3783      5456673 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
    3784              :                            slp_tree_to_load_perm_map_t *perm_cache,
    3785              :                            slp_compat_nodes_map_t *compat_cache,
    3786              :                            hash_set<slp_tree> *visited)
    3787              : {
    3788      5456673 :   unsigned i;
    3789      5456673 :   slp_tree node = *ref_node;
    3790      5456673 :   bool found_p = false;
    3791      5456673 :   if (!node || visited->add (node))
    3792       722047 :     return false;
    3793              :   /* Match the children first (post-order).  */
    3794              :   slp_tree child;
    3795      8745862 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3796      4011236 :     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
    3797              :                                           vinfo, perm_cache, compat_cache,
    3798              :                                           visited);
    3799              :   /* Then try every pattern matcher on this node.  */
    3800     14203878 :   for (unsigned x = 0; x < num__slp_patterns; x++)
    3801              :     {
    3802      9469252 :       vect_pattern *pattern
    3803      9469252 :         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
    3804      9469252 :       if (pattern)
    3805              :         {
    3806         1081 :           pattern->build (vinfo);
    3807         1081 :           delete pattern;
    3808         1081 :           found_p = true;
    3809              :         }
    3810              :     }
    3811              : 
    3812              :   return found_p;
    3813              : }
    3814              : 
    3815              : /* Applies pattern matching to the SLP tree of INSTANCE using
    3816              :    vec_info VINFO.
    3817              : 
    3818              :    Returns true if any pattern matched.  Patterns are tried in order and
    3819              :    multiple patterns may match.  VISITED and the caches are passed through.  */
    3820              : 
    3821              : static bool
    3822      1445437 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
    3823              :                          hash_set<slp_tree> *visited,
    3824              :                          slp_tree_to_load_perm_map_t *perm_cache,
    3825              :                          slp_compat_nodes_map_t *compat_cache)
    3826              : {
    3827      1445437 :   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
    3828      1445437 :   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
    3829              : 
    3830      1445437 :   if (dump_enabled_p ())
    3831        29452 :     dump_printf_loc (MSG_NOTE, vect_location,
    3832              :                      "Analyzing SLP tree %p for patterns\n",
    3833        29452 :                      (void *) SLP_INSTANCE_TREE (instance));
    3834              : 
    3835      1445437 :   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
    3836      1445437 :                                     visited);
    3837              : }
    3838              : 
    3839              : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
    3840              :    splitting into groups of NEW_GROUP_SIZE and GROUP_SIZE - NEW_GROUP_SIZE stmts.
    3841              :    VECTYPE might be NULL, in which case it is derived from the scalar type of
    3842              :    STMT_INFO's data reference.  MASKED_P indicates whether the stores are masked.
    3843              :    Return true if we could use IFN_STORE_LANES instead and if that appears to be the better approach.  */
    3844              : 
    3845              : static bool
    3846         4866 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
    3847              :                                tree vectype, bool masked_p,
    3848              :                                unsigned int group_size,
    3849              :                                unsigned int new_group_size)
    3850              : {
    3851         4866 :   if (!vectype)
    3852              :     {
    3853         4866 :       tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    3854         4866 :       vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
    3855              :     }
    3856         4866 :   if (!vectype)
    3857              :     return false;
    3858              :   /* Allow the split if one of the two new groups would operate on full
    3859              :      vectors *within* rather than across one scalar loop iteration.
    3860              :      This is purely a heuristic, but it should work well for group
    3861              :      sizes of 3 and 4, where the possible splits are:
    3862              : 
    3863              :        3->2+1:  OK if the vector has exactly two elements
    3864              :        4->2+2:  Likewise
    3865              :        4->3+1:  Less clear-cut.  */
    3866         4866 :   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
    3867         2537 :       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    3868         2346 :     return false;
    3869         2520 :   return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
    3870              : }
    3871              : 
    3872              : /* Forward declaration.  Analyze an SLP instance starting from a group of
    3873              :    grouped stores.  Call vect_build_slp_tree to build a tree of packed stmts
    3874              :    if possible.  Return FALSE if it's impossible to SLP any stmt in the loop.  */
    3875              : 
    3876              : static bool
    3877              : vect_analyze_slp_instance (vec_info *vinfo,
    3878              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    3879              :                            stmt_vec_info stmt_info, slp_instance_kind kind,
    3880              :                            unsigned max_tree_size, unsigned *limit,
    3881              :                            bool force_single_lane);
    3882              : 
    3883              : /* Build an interleaving scheme for the store sources RHS_NODES from
    3884              :    SCALAR_STMTS.  MAX_NUNITS is recorded on every node created.  Returns the new node built from SCALAR_STMTS.  */
    3885              : 
    3886              : static slp_tree
    3887         6204 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
    3888              :                                    vec<stmt_vec_info> &scalar_stmts,
    3889              :                                    poly_uint64 max_nunits)
    3890              : {
    3891         6204 :   unsigned int group_size = scalar_stmts.length ();
    3892        12408 :   slp_tree node = vect_create_new_slp_node (scalar_stmts,
    3893         6204 :                                             SLP_TREE_CHILDREN
    3894              :                                               (rhs_nodes[0]).length ());
    3895         6204 :   SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    3896         6204 :   node->max_nunits = max_nunits;
    3897         6204 :   for (unsigned l = 0;
    3898        12435 :        l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    3899              :     {
    3900              :       /* Create a permute merging child L of all RHS SLP trees.  */
    3901         6231 :       slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
    3902         6231 :                                                 VEC_PERM_EXPR);
    3903         6231 :       SLP_TREE_CHILDREN (node).quick_push (perm);
    3904         6231 :       SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
    3905         6231 :       SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
    3906         6231 :       perm->max_nunits = max_nunits;
    3907         6231 :       SLP_TREE_LANES (perm) = group_size;
    3908              :       /* ???  We should set this NULL but that's not expected.  */
    3909         6231 :       SLP_TREE_REPRESENTATIVE (perm)
    3910         6231 :         = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
    3911        24558 :       for (unsigned j = 0; j < rhs_nodes.length (); ++j)
    3912              :         {
    3913        18327 :           SLP_TREE_CHILDREN (perm)
    3914        18327 :             .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
    3915        18327 :           SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
    3916        18327 :           for (unsigned k = 0;
    3917        38665 :                k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
    3918              :             {
    3919              :               /* ???  We should populate SLP_TREE_SCALAR_STMTS
    3920              :                  or SLP_TREE_SCALAR_OPS but then we might have
    3921              :                  a mix of both in our children.  */
    3922        20338 :               SLP_TREE_LANE_PERMUTATION (perm)
    3923        20338 :                 .quick_push (std::make_pair (j, k));
    3924              :             }
    3925              :         }
    3926              : 
    3927              :       /* Now we have a single permute node but we cannot code-generate
    3928              :          the case with more than two inputs.
    3929              :          Perform pairwise reduction, reducing the two inputs
    3930              :          with the least number of lanes to one and then repeat until
    3931              :          we end up with two inputs.  That scheme makes sure we end
    3932              :          up with permutes satisfying the restriction of requiring at
    3933              :          most two vector inputs to produce a single vector output
    3934              :          when the number of lanes is even.  */
    3935        12096 :       while (SLP_TREE_CHILDREN (perm).length () > 2)
    3936              :         {
    3937              :           /* When we have three equal sized groups left the pairwise
    3938              :              reduction does not result in a scheme that avoids using
    3939              :              three vectors.  Instead merge the first two groups
    3940              :              to the final size with do-not-care elements (chosen
    3941              :              from the first group) and then merge with the third.
    3942              :                   { A0, B0,  x, A1, B1,  x, ... }
    3943              :                -> { A0, B0, C0, A1, B1, C1, ... }
    3944              :              This handles group size of three (and at least
    3945              :              power-of-two multiples of that).  Two children remain afterwards.  */
    3946         5865 :           if (SLP_TREE_CHILDREN (perm).length () == 3
    3947         3022 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    3948         3022 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
    3949         5865 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    3950         2280 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
    3951              :             {
    3952         2084 :               int ai = 0;
    3953         2084 :               int bi = 1;
    3954         2084 :               slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    3955         2084 :               slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    3956         2084 :               unsigned n = SLP_TREE_LANES (perm);
    3957              : 
    3958         2084 :               slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    3959         2084 :               SLP_TREE_LANES (permab) = n;
    3960         2084 :               SLP_TREE_LANE_PERMUTATION (permab).create (n);
    3961         2084 :               SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    3962         2084 :               permab->max_nunits = max_nunits;
    3963              :               /* ???  Should be NULL but that's not expected.  */
    3964         2084 :               SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    3965         2084 :               SLP_TREE_CHILDREN (permab).quick_push (a);
    3966         4179 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    3967         2095 :                 SLP_TREE_LANE_PERMUTATION (permab)
    3968         2095 :                   .quick_push (std::make_pair (0, k));
    3969         2084 :               SLP_TREE_CHILDREN (permab).quick_push (b);
    3970         4179 :               for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    3971         2095 :                 SLP_TREE_LANE_PERMUTATION (permab)
    3972         2095 :                   .quick_push (std::make_pair (1, k));
    3973              :               /* Push the do-not-care lanes.  */
    3974         4179 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    3975         2095 :                 SLP_TREE_LANE_PERMUTATION (permab)
    3976         2095 :                   .quick_push (std::make_pair (0, k));
    3977              : 
    3978              :               /* Put the merged node into 'perm', in place of a.  */
    3979         2084 :               SLP_TREE_CHILDREN (perm)[ai] = permab;
    3980              :               /* Adjust the references to b in the permutation
    3981              :                  of perm and to the later children which we'll
    3982              :                  remove.  */
    3983         8369 :               for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    3984              :                 {
    3985         6285 :                   std::pair<unsigned, unsigned> &p
    3986         6285 :                     = SLP_TREE_LANE_PERMUTATION (perm)[k];
    3987         6285 :                   if (p.first == (unsigned) bi)
    3988              :                     {
    3989         2095 :                       p.first = ai;
    3990         2095 :                       p.second += SLP_TREE_LANES (a);
    3991              :                     }
    3992         4190 :                   else if (p.first > (unsigned) bi)
    3993         2095 :                     p.first--;
    3994              :                 }
    3995         2084 :               SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    3996         2084 :               break;
    3997              :             }
    3998              : 
    3999              :           /* Pick the two nodes with the least number of lanes,
    4000              :              prefer the earliest candidate and maintain ai < bi.  */
    4001              :           int ai = -1;
    4002              :           int bi = -1;
    4003        33069 :           for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
    4004              :             {
    4005        29288 :               if (ai == -1)
    4006         3781 :                 ai = ci;
    4007        25507 :               else if (bi == -1)
    4008         3781 :                 bi = ci;
    4009        21726 :               else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4010        21726 :                         < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
    4011        21726 :                        || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4012        17812 :                            < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
    4013              :                 {
    4014         8714 :                   if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
    4015         4357 :                       <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
    4016         2074 :                     bi = ci;
    4017              :                   else
    4018              :                     {
    4019         2283 :                       ai = bi;
    4020         2283 :                       bi = ci;
    4021              :                     }
    4022              :                 }
    4023              :             }
    4024              : 
    4025              :           /* Produce a merge of nodes ai and bi.  */
    4026         3781 :           slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4027         3781 :           slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4028         3781 :           unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
    4029         3781 :           slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4030         3781 :           SLP_TREE_LANES (permab) = n;
    4031         3781 :           SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4032         3781 :           SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4033         3781 :           permab->max_nunits = max_nunits;
    4034              :           /* ???  Should be NULL but that's not expected.  */
    4035         3781 :           SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4036         3781 :           SLP_TREE_CHILDREN (permab).quick_push (a);
    4037         9886 :           for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4038         6105 :             SLP_TREE_LANE_PERMUTATION (permab)
    4039         6105 :               .quick_push (std::make_pair (0, k));
    4040         3781 :           SLP_TREE_CHILDREN (permab).quick_push (b);
    4041         9398 :           for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4042         5617 :             SLP_TREE_LANE_PERMUTATION (permab)
    4043         5617 :               .quick_push (std::make_pair (1, k));
    4044              : 
    4045              :           /* Put the merged node into 'perm', in place of a.  */
    4046         3781 :           SLP_TREE_CHILDREN (perm)[ai] = permab;
    4047              :           /* Adjust the references to b in the permutation
    4048              :              of perm and to the later children which we'll
    4049              :              remove.  */
    4050        52693 :           for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4051              :             {
    4052        48912 :               std::pair<unsigned, unsigned> &p
    4053        48912 :                 = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4054        48912 :               if (p.first == (unsigned) bi)
    4055              :                 {
    4056         5617 :                   p.first = ai;
    4057         5617 :                   p.second += SLP_TREE_LANES (a);
    4058              :                 }
    4059        43295 :               else if (p.first > (unsigned) bi)
    4060        17862 :                 p.first--;
    4061              :             }
    4062         3781 :           SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4063              :         }
    4064              :     }
    4065              : 
    4066         6204 :   return node;
    4067              : }
    4068              : 
    4069              : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
    4070              :    of KIND.  Return true if successful.  SCALAR_STMTS is owned by this
    4071              :    function, REMAIN and ROOT_STMT_INFOS ownership is transferred back to
    4072              :    the caller upon failure.  */
    4073              : 
    4074              : static bool
    4075      1790563 : vect_build_slp_instance (vec_info *vinfo,
    4076              :                          slp_instance_kind kind,
    4077              :                          vec<stmt_vec_info> &scalar_stmts,
    4078              :                          vec<stmt_vec_info> &root_stmt_infos,
    4079              :                          vec<tree> &remain,
    4080              :                          unsigned max_tree_size, unsigned *limit,
    4081              :                          scalar_stmts_to_slp_tree_map_t *bst_map,
    4082              :                          bool force_single_lane)
    4083              : {
    4084              :   /* If there's no budget left bail out early.  */
    4085      1790563 :   if (*limit == 0)
    4086              :     {
    4087        27205 :       scalar_stmts.release ();
    4088        27205 :       return false;
    4089              :     }
    4090              : 
    4091      1763358 :   if (kind == slp_inst_kind_ctor)
    4092              :     {
    4093        12453 :       if (dump_enabled_p ())
    4094           86 :         dump_printf_loc (MSG_NOTE, vect_location,
    4095              :                          "Analyzing vectorizable constructor: %G\n",
    4096           43 :                          root_stmt_infos[0]->stmt);
    4097              :     }
    4098      1750905 :   else if (kind == slp_inst_kind_gcond)
    4099              :     {
    4100       272810 :       if (dump_enabled_p ())
    4101         5260 :         dump_printf_loc (MSG_NOTE, vect_location,
    4102              :                          "Analyzing vectorizable control flow: %G",
    4103         2630 :                          root_stmt_infos[0]->stmt);
    4104              :     }
    4105              : 
    4106      1763358 :   if (dump_enabled_p ())
    4107              :     {
    4108        24636 :       dump_printf_loc (MSG_NOTE, vect_location,
    4109              :                        "Starting SLP discovery for\n");
    4110        52615 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4111        55958 :         dump_printf_loc (MSG_NOTE, vect_location,
    4112        27979 :                          "  %G", scalar_stmts[i]->stmt);
    4113              :     }
    4114              : 
    4115              :   /* Build the tree for the SLP instance.  */
    4116      1763358 :   unsigned int group_size = scalar_stmts.length ();
    4117      1763358 :   bool *matches = XALLOCAVEC (bool, group_size);
    4118      1763358 :   poly_uint64 max_nunits = 1;
    4119      1763358 :   unsigned tree_size = 0;
    4120              : 
    4121      1763358 :   slp_tree node = NULL;
    4122      1763358 :   if (group_size > 1 && force_single_lane)
    4123              :     {
    4124            0 :       matches[0] = true;
    4125            0 :       matches[1] = false;
    4126              :     }
    4127              :   else
    4128      1763358 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4129              :                                 &max_nunits, matches, limit,
    4130              :                                 &tree_size, bst_map);
    4131      1763358 :   if (node != NULL)
    4132              :     {
    4133              :       /* Calculate the unrolling factor based on the smallest type.  */
    4134       700502 :       poly_uint64 unrolling_factor
    4135       700502 :         = calculate_unrolling_factor (max_nunits, group_size);
    4136              : 
    4137       700502 :       if (maybe_ne (unrolling_factor, 1U)
    4138       700502 :           && is_a <bb_vec_info> (vinfo))
    4139              :         {
    4140            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    4141            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    4142            0 :               || const_max_nunits > group_size)
    4143              :             {
    4144            0 :               if (dump_enabled_p ())
    4145            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4146              :                                  "Build SLP failed: store group "
    4147              :                                  "size not a multiple of the vector size "
    4148              :                                  "in basic block SLP\n");
    4149            0 :               vect_free_slp_tree (node);
    4150            0 :               return false;
    4151              :             }
    4152              :           /* Fatal mismatch.  */
    4153            0 :           if (dump_enabled_p ())
    4154            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    4155              :                              "SLP discovery succeeded but node needs "
    4156              :                              "splitting\n");
    4157            0 :           memset (matches, true, group_size);
    4158            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    4159            0 :           vect_free_slp_tree (node);
    4160              :         }
    4161              :       else
    4162              :         {
    4163              :           /* Create a new SLP instance.  */
    4164       700502 :           slp_instance new_instance = XNEW (class _slp_instance);
    4165       700502 :           SLP_INSTANCE_TREE (new_instance) = node;
    4166       700502 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4167       700502 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4168       700502 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4169       700502 :           SLP_INSTANCE_KIND (new_instance) = kind;
    4170       700502 :           new_instance->reduc_phis = NULL;
    4171       700502 :           new_instance->cost_vec = vNULL;
    4172       700502 :           new_instance->subgraph_entries = vNULL;
    4173              : 
    4174       700502 :           if (dump_enabled_p ())
    4175        21654 :             dump_printf_loc (MSG_NOTE, vect_location,
    4176              :                              "SLP size %u vs. limit %u.\n",
    4177              :                              tree_size, max_tree_size);
    4178              : 
    4179       700502 :           vinfo->slp_instances.safe_push (new_instance);
    4180              : 
    4181              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4182              :              the number of scalar stmts in the root in a few places.
    4183              :              Verify that assumption holds.  */
    4184      1401004 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4185              :                         .length () == group_size);
    4186              : 
    4187       700502 :           if (dump_enabled_p ())
    4188              :             {
    4189        21654 :               if (kind == slp_inst_kind_reduc_group)
    4190         1407 :                 dump_printf_loc (MSG_NOTE, vect_location,
    4191              :                                  "SLP discovery of size %d reduction group "
    4192              :                                  "succeeded\n", group_size);
    4193        21654 :               dump_printf_loc (MSG_NOTE, vect_location,
    4194              :                                "Final SLP tree for instance %p:\n",
    4195              :                                (void *) new_instance);
    4196        21654 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    4197              :                                     SLP_INSTANCE_TREE (new_instance));
    4198              :             }
    4199              : 
    4200       700502 :           return true;
    4201              :         }
    4202              :     }
    4203              :   /* Failed to SLP.  */
    4204              : 
    4205              :   /* While we can arrive here even with slp_inst_kind_store, we should
    4206              :      only do so for group_size == 1.  The code to split store groups is
    4207              :      only in vect_analyze_slp_instance now.  */
    4208      1062856 :   gcc_assert (kind != slp_inst_kind_store || group_size == 1);
    4209              : 
    4210              :   /* Free the allocated memory.  */
    4211      1062856 :   scalar_stmts.release ();
    4212              : 
    4213              :   /* Failed to SLP.  */
    4214      1062856 :   if (dump_enabled_p ())
    4215         2982 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4216              :   return false;
    4217              : }
    4218              : 
    4219              : /* Analyze an SLP instance starting from the start of a reduction chain.
    4220              :    Call vect_build_slp_tree to build a tree of packed stmts if possible.
    4221              :    Return FALSE if SLP build fails.  */
    4222              : 
    4223              : static bool
    4224        42762 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
    4225              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    4226              :                               stmt_vec_info scalar_stmt,
    4227              :                               unsigned max_tree_size, unsigned *limit)
    4228              : {
    4229        42762 :   vec<stmt_vec_info> scalar_stmts = vNULL;
    4230              : 
    4231        42762 :   bool fail = false;
    4232              :   /* ???  We could leave operation code checking to SLP discovery.  */
    4233        42762 :   code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
    4234              :                                               (vect_orig_stmt (scalar_stmt)));
    4235        42762 :   bool first = true;
    4236        42762 :   stmt_vec_info next_stmt = scalar_stmt;
    4237        47906 :   do
    4238              :     {
    4239        47906 :       stmt_vec_info stmt = next_stmt;
    4240        47906 :       gimple_match_op op;
    4241        47906 :       if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
    4242            0 :         gcc_unreachable ();
    4243        95812 :       tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
    4244        47906 :                                    STMT_VINFO_REDUC_IDX (stmt));
    4245        47906 :       next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
    4246        47906 :       gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
    4247              :                   || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
    4248        51310 :       if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
    4249            0 :         gcc_unreachable ();
    4250        47906 :       if (CONVERT_EXPR_CODE_P (op.code)
    4251         2149 :           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
    4252        50043 :           && (first
    4253         1058 :               || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
    4254              :         ;
    4255        45771 :       else if (code != op.code)
    4256              :         {
    4257         1718 :           fail = true;
    4258         1718 :           break;
    4259              :         }
    4260              :       else
    4261        44053 :         scalar_stmts.safe_push (stmt);
    4262        46188 :       first = false;
    4263              :     }
    4264        46188 :   while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
    4265        42762 :   if (fail)
    4266         1718 :     return false;
    4267              : 
    4268              :   /* Remember a stmt with the actual reduction operation.  */
    4269        41044 :   stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
    4270              : 
    4271              :   /* When the SSA def chain through reduc-idx does not form a natural
    4272              :      reduction chain try to linearize an associative operation manually.  */
    4273        41044 :   if (scalar_stmts.length () == 1
    4274        39389 :       && code.is_tree_code ()
    4275        36003 :       && associative_tree_code ((tree_code)code)
    4276              :       /* We may not associate if a fold-left reduction is required.  */
    4277        76176 :       && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
    4278              :                                                     (reduc_scalar_stmt->stmt)),
    4279              :                                        code))
    4280              :     {
    4281        33308 :       auto_vec<chain_op_t> chain;
    4282        33308 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    4283        33308 :       gimple *op_stmt = NULL, *other_op_stmt = NULL;
    4284        33308 :       vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4285        33308 :                                 scalar_stmts[0]->stmt, op_stmt, other_op_stmt,
    4286              :                                 NULL);
    4287              : 
    4288        33308 :       scalar_stmts.truncate (0);
    4289        33308 :       stmt_vec_info tail = NULL;
    4290       165781 :       for (auto el : chain)
    4291              :         {
    4292        66539 :           if (el.dt == vect_external_def
    4293        66539 :               || el.dt == vect_constant_def
    4294        66539 :               || el.code != (tree_code) code)
    4295              :             {
    4296          682 :               scalar_stmts.release ();
    4297          682 :               return false;
    4298              :             }
    4299        65857 :           stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4300        65857 :           if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4301        64890 :               || STMT_VINFO_REDUC_DEF (stmt))
    4302              :             {
    4303        32802 :               gcc_assert (tail == NULL);
    4304        32802 :               tail = stmt;
    4305        32802 :               continue;
    4306              :             }
    4307        33055 :           scalar_stmts.safe_push (stmt);
    4308              :         }
    4309        32626 :       gcc_assert (tail);
    4310              : 
    4311              :       /* When this linearization didn't produce a chain see if stripping
    4312              :          a wrapping sign conversion produces one.  */
    4313        32626 :       if (scalar_stmts.length () == 1
    4314        32626 :           && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
    4315              :               || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
    4316              :         {
    4317        31336 :           gimple *stmt = scalar_stmts[0]->stmt;
    4318        31336 :           if (!is_gimple_assign (stmt)
    4319        30296 :               || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
    4320         3917 :               || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
    4321        35253 :               || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    4322         3917 :                                          TREE_TYPE (gimple_assign_rhs1 (stmt))))
    4323              :             {
    4324        29856 :               scalar_stmts.release ();
    4325        29856 :               return false;
    4326              :             }
    4327         1480 :           stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
    4328         1480 :           if (!is_gimple_assign (stmt)
    4329         1480 :               || gimple_assign_rhs_code (stmt) != (tree_code)code)
    4330              :             {
    4331         1462 :               scalar_stmts.release ();
    4332         1462 :               return false;
    4333              :             }
    4334           18 :           chain.truncate (0);
    4335           18 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4336              :                                     stmt, op_stmt, other_op_stmt, NULL);
    4337              : 
    4338           18 :           scalar_stmts.truncate (0);
    4339           18 :           tail = NULL;
    4340           88 :           for (auto el : chain)
    4341              :             {
    4342           42 :               if (el.dt == vect_external_def
    4343           42 :                   || el.dt == vect_constant_def
    4344           42 :                   || el.code != (tree_code) code)
    4345              :                 {
    4346            8 :                   scalar_stmts.release ();
    4347            8 :                   return false;
    4348              :                 }
    4349           34 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4350           34 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4351           34 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4352              :                 {
    4353            0 :                   gcc_assert (tail == NULL);
    4354            0 :                   tail = stmt;
    4355            0 :                   continue;
    4356              :                 }
    4357           34 :               scalar_stmts.safe_push (stmt);
    4358              :             }
    4359              :           /* Unlike the above this does not include the reduction SSA
    4360              :              cycle.  */
    4361           10 :           gcc_assert (!tail);
    4362              :         }
    4363              : 
    4364         1300 :       if (scalar_stmts.length () < 2)
    4365              :         {
    4366         1207 :           scalar_stmts.release ();
    4367         1207 :           return false;
    4368              :         }
    4369              : 
    4370           93 :       if (dump_enabled_p ())
    4371              :         {
    4372           34 :           dump_printf_loc (MSG_NOTE, vect_location,
    4373              :                            "Starting SLP discovery of reduction chain for\n");
    4374          140 :           for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4375          212 :             dump_printf_loc (MSG_NOTE, vect_location,
    4376          106 :                              "  %G", scalar_stmts[i]->stmt);
    4377              :         }
    4378              : 
    4379           93 :       unsigned int group_size = scalar_stmts.length ();
    4380           93 :       bool *matches = XALLOCAVEC (bool, group_size);
    4381           93 :       poly_uint64 max_nunits = 1;
    4382           93 :       unsigned tree_size = 0;
    4383           93 :       slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4384              :                                            &max_nunits, matches, limit,
    4385           93 :                                            &tree_size, bst_map);
    4386           93 :       if (!node)
    4387              :         {
    4388           37 :           scalar_stmts.release ();
    4389           37 :           return false;
    4390              :         }
    4391              : 
    4392           56 :       unsigned cycle_id = vinfo->reduc_infos.length ();
    4393           56 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    4394           56 :       vinfo->reduc_infos.safe_push (reduc_info);
    4395           56 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
    4396           56 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
    4397           56 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
    4398           56 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    4399           56 :       reduc_info->is_reduc_chain = true;
    4400              : 
    4401              :       /* Build the node for the PHI and possibly the conversions.  */
    4402           56 :       slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
    4403           56 :       SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
    4404           56 :       phis->cycle_info.id = cycle_id;
    4405           56 :       SLP_TREE_LANES (phis) = group_size;
    4406           56 :       if (reduc_scalar_stmt == scalar_stmt)
    4407           52 :         SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
    4408              :       else
    4409            4 :         SLP_TREE_VECTYPE (phis)
    4410            4 :           = signed_or_unsigned_type_for (TYPE_UNSIGNED
    4411              :                                            (TREE_TYPE (gimple_get_lhs
    4412              :                                                          (scalar_stmt->stmt))),
    4413              :                                          SLP_TREE_VECTYPE (node));
    4414              :       /* ???  vect_cse_slp_nodes cannot cope with cycles without any
    4415              :          SLP_TREE_SCALAR_STMTS.  */
    4416           56 :       SLP_TREE_SCALAR_STMTS (phis).create (group_size);
    4417          235 :       for (unsigned i = 0; i < group_size; ++i)
    4418          179 :         SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
    4419              : 
    4420           56 :       slp_tree op_input = phis;
    4421           56 :       if (reduc_scalar_stmt != scalar_stmt)
    4422              :         {
    4423            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4424            4 :           SLP_TREE_REPRESENTATIVE (conv)
    4425            4 :             = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
    4426            4 :                                              STMT_VINFO_REDUC_IDX
    4427              :                                                (reduc_scalar_stmt)));
    4428            4 :           SLP_TREE_CHILDREN (conv).quick_push (phis);
    4429            4 :           conv->cycle_info.id = cycle_id;
    4430            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4431            4 :           SLP_TREE_LANES (conv) = group_size;
    4432            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
    4433            4 :           SLP_TREE_SCALAR_STMTS (conv) = vNULL;
    4434            4 :           op_input = conv;
    4435              :         }
    4436              : 
    4437           56 :       slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
    4438           56 :       SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
    4439           56 :       SLP_TREE_CHILDREN (reduc).quick_push (op_input);
    4440           56 :       SLP_TREE_CHILDREN (reduc).quick_push (node);
    4441           56 :       reduc->cycle_info.id = cycle_id;
    4442           56 :       SLP_TREE_REDUC_IDX (reduc) = 0;
    4443           56 :       SLP_TREE_LANES (reduc) = group_size;
    4444           56 :       SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
    4445              :       /* ???  For the reduction epilogue we need a live lane.  */
    4446           56 :       SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
    4447           56 :       SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
    4448          179 :       for (unsigned i = 1; i < group_size; ++i)
    4449          123 :         SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
    4450              : 
    4451           56 :       if (reduc_scalar_stmt != scalar_stmt)
    4452              :         {
    4453            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4454            4 :           SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
    4455            4 :           SLP_TREE_CHILDREN (conv).quick_push (reduc);
    4456            4 :           conv->cycle_info.id = cycle_id;
    4457            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4458            4 :           SLP_TREE_LANES (conv) = group_size;
    4459            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
    4460              :           /* ???  For the reduction epilogue we need a live lane.  */
    4461            4 :           SLP_TREE_SCALAR_STMTS (conv).create (group_size);
    4462            4 :           SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
    4463            8 :           for (unsigned i = 1; i < group_size; ++i)
    4464            4 :             SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
    4465            4 :           reduc = conv;
    4466              :         }
    4467              : 
    4468           56 :       edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
    4469           56 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4470           56 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4471           56 :       SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
    4472           56 :       SLP_TREE_REF_COUNT (reduc)++;
    4473              : 
    4474              :       /* Create a new SLP instance.  */
    4475           56 :       slp_instance new_instance = XNEW (class _slp_instance);
    4476           56 :       SLP_INSTANCE_TREE (new_instance) = reduc;
    4477           56 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4478           56 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4479           56 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4480           56 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4481           56 :       new_instance->reduc_phis = NULL;
    4482           56 :       new_instance->cost_vec = vNULL;
    4483           56 :       new_instance->subgraph_entries = vNULL;
    4484              : 
    4485           56 :       vinfo->slp_instances.safe_push (new_instance);
    4486              : 
    4487           56 :       if (dump_enabled_p ())
    4488              :         {
    4489           24 :           dump_printf_loc (MSG_NOTE, vect_location,
    4490              :                            "Final SLP tree for instance %p:\n",
    4491              :                            (void *) new_instance);
    4492           24 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4493              :                                 SLP_INSTANCE_TREE (new_instance));
    4494              :         }
    4495              : 
    4496           56 :       return true;
    4497        33308 :     }
    4498              : 
    4499         7736 :   if (scalar_stmts.length () <= 1)
    4500              :     {
    4501         6081 :       scalar_stmts.release ();
    4502         6081 :       return false;
    4503              :     }
    4504              : 
    4505         1655 :   scalar_stmts.reverse ();
    4506         1655 :   stmt_vec_info reduc_phi_info = next_stmt;
    4507              : 
    4508              :   /* Build the tree for the SLP instance.  */
    4509         1655 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4510         1655 :   vec<tree> remain = vNULL;
    4511              : 
    4512         1655 :   if (dump_enabled_p ())
    4513              :     {
    4514          180 :       dump_printf_loc (MSG_NOTE, vect_location,
    4515              :                        "Starting SLP discovery of reduction chain for\n");
    4516          966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4517         1572 :         dump_printf_loc (MSG_NOTE, vect_location,
    4518          786 :                          "  %G", scalar_stmts[i]->stmt);
    4519              :     }
    4520              : 
    4521              :   /* Build the tree for the SLP instance.  */
    4522         1655 :   unsigned int group_size = scalar_stmts.length ();
    4523         1655 :   bool *matches = XALLOCAVEC (bool, group_size);
    4524         1655 :   poly_uint64 max_nunits = 1;
    4525         1655 :   unsigned tree_size = 0;
    4526              : 
    4527              :   /* ???  We need this only for SLP discovery.  */
    4528         6315 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4529         4660 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
    4530              : 
    4531         1655 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4532              :                                        &max_nunits, matches, limit,
    4533         1655 :                                        &tree_size, bst_map);
    4534              : 
    4535         6315 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4536         4660 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
    4537              : 
    4538         1655 :   if (node != NULL)
    4539              :     {
    4540              :       /* Create a new SLP instance.  */
    4541         1395 :       slp_instance new_instance = XNEW (class _slp_instance);
    4542         1395 :       SLP_INSTANCE_TREE (new_instance) = node;
    4543         1395 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4544         1395 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4545         1395 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4546         1395 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4547         1395 :       new_instance->reduc_phis = NULL;
    4548         1395 :       new_instance->cost_vec = vNULL;
    4549         1395 :       new_instance->subgraph_entries = vNULL;
    4550              : 
    4551         1395 :       vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
    4552         1395 :       reduc_info->is_reduc_chain = true;
    4553              : 
    4554         1395 :       if (dump_enabled_p ())
    4555          135 :         dump_printf_loc (MSG_NOTE, vect_location,
    4556              :                          "SLP size %u vs. limit %u.\n",
    4557              :                          tree_size, max_tree_size);
    4558              : 
    4559              :       /* Fixup SLP reduction chains.  If this is a reduction chain with
    4560              :          a conversion in front amend the SLP tree with a node for that.  */
    4561         1395 :       gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
    4562         1395 :       if (is_gimple_assign (scalar_def)
    4563         1395 :           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
    4564              :         {
    4565           28 :           stmt_vec_info conv_info = vect_stmt_to_vectorize
    4566           28 :                                         (STMT_VINFO_REDUC_DEF (reduc_phi_info));
    4567           28 :           scalar_stmts = vNULL;
    4568           28 :           scalar_stmts.create (group_size);
    4569           90 :           for (unsigned i = 0; i < group_size; ++i)
    4570           62 :             scalar_stmts.quick_push (conv_info);
    4571           28 :           slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
    4572           28 :           SLP_TREE_VECTYPE (conv)
    4573           28 :             = get_vectype_for_scalar_type (vinfo,
    4574           28 :                                            TREE_TYPE
    4575              :                                              (gimple_assign_lhs (scalar_def)),
    4576              :                                            group_size);
    4577           28 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4578           28 :           conv->cycle_info.id = node->cycle_info.id;
    4579           28 :           SLP_TREE_CHILDREN (conv).quick_push (node);
    4580           28 :           SLP_INSTANCE_TREE (new_instance) = conv;
    4581              :         }
    4582              :       /* Fill the backedge child of the PHI SLP node.  The
    4583              :          general matching code cannot find it because the
    4584              :          scalar code does not reflect how we vectorize the
    4585              :          reduction.  */
    4586         1395 :       use_operand_p use_p;
    4587         1395 :       imm_use_iterator imm_iter;
    4588         1395 :       class loop *loop = LOOP_VINFO_LOOP (vinfo);
    4589         6670 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
    4590              :                              gimple_get_lhs (scalar_def))
    4591              :         /* There are exactly two non-debug uses, the reduction
    4592              :            PHI and the loop-closed PHI node.  */
    4593         3880 :         if (!is_gimple_debug (USE_STMT (use_p))
    4594         3880 :             && gimple_bb (USE_STMT (use_p)) == loop->header)
    4595              :           {
    4596         1395 :             auto_vec<stmt_vec_info, 64> phis (group_size);
    4597         1395 :             stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
    4598         5386 :             for (unsigned i = 0; i < group_size; ++i)
    4599         3991 :               phis.quick_push (phi_info);
    4600         1395 :             slp_tree *phi_node = bst_map->get (phis);
    4601         1395 :             unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
    4602         2790 :             SLP_TREE_CHILDREN (*phi_node)[dest_idx]
    4603         1395 :               = SLP_INSTANCE_TREE (new_instance);
    4604         1395 :             SLP_INSTANCE_TREE (new_instance)->refcnt++;
    4605         1395 :           }
    4606              : 
    4607         1395 :       vinfo->slp_instances.safe_push (new_instance);
    4608              : 
    4609              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4610              :          the number of scalar stmts in the root in a few places.
    4611              :          Verify that assumption holds.  */
    4612         2790 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4613              :                   .length () == group_size);
    4614              : 
    4615         1395 :       if (dump_enabled_p ())
    4616              :         {
    4617          135 :           dump_printf_loc (MSG_NOTE, vect_location,
    4618              :                            "Final SLP tree for instance %p:\n",
    4619              :                            (void *) new_instance);
    4620          135 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4621              :                                 SLP_INSTANCE_TREE (new_instance));
    4622              :         }
    4623              : 
    4624         1395 :       return true;
    4625              :     }
    4626              : 
    4627              :   /* Failed to SLP.  */
    4628          260 :   scalar_stmts.release ();
    4629          260 :   if (dump_enabled_p ())
    4630           45 :     dump_printf_loc (MSG_NOTE, vect_location,
    4631              :                      "SLP discovery of reduction chain failed\n");
    4632              :   return false;
    4633              : }
    4634              : 
    4635              : /* Analyze a single-lane SLP reduction instance rooted at the reduction
    4636              :    stmt SCALAR_STMT of the loop VINFO.  When FORCE_SINGLE_LANE is false
                      :    and SCALAR_STMT is a vect_reduction_def, discovery of a reduction
                      :    chain is tried first.  *LIMIT is the remaining discovery budget;
                      :    nothing is attempted when it is zero.  MAX_TREE_SIZE is passed on
                      :    to reduction chain discovery and otherwise only printed in dumps
                      :    here.  BST_MAP is handed through to SLP tree building.  On success
                      :    a slp_inst_kind_reduc_group instance is pushed to
                      :    VINFO->slp_instances.  Return true if successful.  */
    4637              : 
    4638              : static bool
    4639        63542 : vect_analyze_slp_reduction (loop_vec_info vinfo,
    4640              :                             stmt_vec_info scalar_stmt,
    4641              :                             unsigned max_tree_size, unsigned *limit,
    4642              :                             scalar_stmts_to_slp_tree_map_t *bst_map,
    4643              :                             bool force_single_lane)
    4644              : {
    4645        63542 :   slp_instance_kind kind = slp_inst_kind_reduc_group;
    4646              : 
    4647              :   /* If there's no budget left bail out early.  */
    4648        63542 :   if (*limit == 0)
    4649              :     return false;
    4650              : 
    4651              :   /* Try to gather a reduction chain.  */
    4652        63542 :   if (! force_single_lane
    4653        42979 :       && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
    4654       106304 :       && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
    4655              :                                        max_tree_size, limit))
    4656              :     return true;
    4657              : 
    4658        62091 :   vec<stmt_vec_info> scalar_stmts;
    4659        62091 :   scalar_stmts.create (1);
    4660        62091 :   scalar_stmts.quick_push (scalar_stmt);
    4661              : 
    4662        62091 :   if (dump_enabled_p ())
    4663              :     {
    4664         3338 :       dump_printf_loc (MSG_NOTE, vect_location,
    4665              :                        "Starting SLP discovery for\n");
    4666         6676 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4667         6676 :         dump_printf_loc (MSG_NOTE, vect_location,
    4668         3338 :                          "  %G", scalar_stmts[i]->stmt);
    4669              :     }
    4670              : 
    4671              :   /* Build the tree for the SLP instance; SCALAR_STMTS has one lane.  */
    4672        62091 :   unsigned int group_size = scalar_stmts.length ();
    4673        62091 :   bool *matches = XALLOCAVEC (bool, group_size);
    4674        62091 :   poly_uint64 max_nunits = 1;
    4675        62091 :   unsigned tree_size = 0;
    4676              : 
    4677        62091 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4678              :                                        &max_nunits, matches, limit,
    4679              :                                        &tree_size, bst_map);
    4680        62091 :   if (node != NULL)
    4681              :     {
    4682              :       /* Create a new SLP instance.  */
    4683        59506 :       slp_instance new_instance = XNEW (class _slp_instance);
    4684        59506 :       SLP_INSTANCE_TREE (new_instance) = node;
    4685        59506 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4686        59506 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4687        59506 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4688        59506 :       SLP_INSTANCE_KIND (new_instance) = kind;
    4689        59506 :       new_instance->reduc_phis = NULL;
    4690        59506 :       new_instance->cost_vec = vNULL;
    4691        59506 :       new_instance->subgraph_entries = vNULL;
    4692              : 
    4693        59506 :       if (dump_enabled_p ())
    4694         3222 :         dump_printf_loc (MSG_NOTE, vect_location,
    4695              :                          "SLP size %u vs. limit %u.\n",
    4696              :                          tree_size, max_tree_size);
    4697              : 
    4698        59506 :       vinfo->slp_instances.safe_push (new_instance);
    4699              : 
    4700              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4701              :          the number of scalar stmts in the root in a few places.
    4702              :          Verify that assumption holds.  */
    4703       119012 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4704              :                   .length () == group_size);
    4705              : 
    4706        59506 :       if (dump_enabled_p ())
    4707              :         {
    4708         3222 :           dump_printf_loc (MSG_NOTE, vect_location,
    4709              :                            "Final SLP tree for instance %p:\n",
    4710              :                            (void *) new_instance);
    4711         3222 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4712              :                                 SLP_INSTANCE_TREE (new_instance));
    4713              :         }
    4714              : 
    4715        59506 :       return true;
    4716              :     }
    4717              :   /* Failed to SLP.  */
    4718              : 
    4719              :   /* Free the allocated memory.  SCALAR_STMTS is only released on this
                      :      failure path.  */
    4720         2585 :   scalar_stmts.release ();
    4721              : 
    4722              :   /* Report the failure when dumping is enabled.  */
    4723         2585 :   if (dump_enabled_p ())
    4724          116 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4725              :   return false;
    4726              : }
    4727              : 
    4728              : /* Analyze the stmts in SCALAR_STMTS as a single SLP reduction group of
    4729              :    LOOP_VINFO.  If successful add a slp_inst_kind_reduc_group SLP instance
    4730              :    for it to LOOP_VINFO->slp_instances and return true, otherwise return
                      :    false and have *MATCHES populated.  MATCHES may be NULL in which
                      :    case a local scratch array is used and the match information is
                      :    not available to the caller.  *LIMIT and BST_MAP are handed to
                      :    vect_build_slp_tree; MAX_TREE_SIZE is only printed in dumps.  */
    4731              : 
    4732              : static bool
    4733        18143 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
    4734              :                                   vec<stmt_vec_info> scalar_stmts,
    4735              :                                   scalar_stmts_to_slp_tree_map_t *bst_map,
    4736              :                                   unsigned max_tree_size, unsigned *limit,
    4737              :                                   bool *matches)
    4738              : {
    4739              :   /* Try to form a reduction group.  When the caller passed no MATCHES
                      :      array provide a scratch one for vect_build_slp_tree.  */
    4740        18143 :   unsigned int group_size = scalar_stmts.length ();
    4741        18143 :   if (!matches)
    4742         7417 :     matches = XALLOCAVEC (bool, group_size);
    4743        18143 :   poly_uint64 max_nunits = 1;
    4744        18143 :   unsigned tree_size = 0;
    4745        18143 :   slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
    4746              :                                        group_size,
    4747              :                                        &max_nunits, matches, limit,
    4748              :                                        &tree_size, bst_map);
    4749        18143 :   if (!node)
    4750              :     return false;
    4751              : 
    4752              :   /* Create a new SLP instance.  */
    4753         8601 :   slp_instance new_instance = XNEW (class _slp_instance);
    4754         8601 :   SLP_INSTANCE_TREE (new_instance) = node;
    4755         8601 :   SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4756         8601 :   SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4757         8601 :   SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4758         8601 :   SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
    4759         8601 :   new_instance->reduc_phis = NULL;
    4760         8601 :   new_instance->cost_vec = vNULL;
    4761         8601 :   new_instance->subgraph_entries = vNULL;
    4762              : 
    4763         8601 :   if (dump_enabled_p ())
    4764          544 :     dump_printf_loc (MSG_NOTE, vect_location,
    4765              :                      "SLP size %u vs. limit %u.\n",
    4766              :                      tree_size, max_tree_size);
    4767              : 
    4768         8601 :   loop_vinfo->slp_instances.safe_push (new_instance);
    4769              : 
    4770              :   /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4771              :      the number of scalar stmts in the root in a few places.
    4772              :      Verify that assumption holds.  */
    4773        17202 :   gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4774              :               .length () == group_size);
    4775              : 
    4776         8601 :   if (dump_enabled_p ())
    4777              :     {
    4778          544 :       dump_printf_loc (MSG_NOTE, vect_location,
    4779              :                        "SLP discovery of size %d reduction group "
    4780              :                        "succeeded\n", group_size);
    4781          544 :       dump_printf_loc (MSG_NOTE, vect_location,
    4782              :                        "Final SLP tree for instance %p:\n",
    4783              :                        (void *) new_instance);
    4784          544 :       vect_print_slp_graph (MSG_NOTE, vect_location,
    4785              :                             SLP_INSTANCE_TREE (new_instance));
    4786              :     }
    4787              : 
    4788              :   return true;
    4789              : }
    4790              : 
    4791              : /* Analyze reductions in LOOP_VINFO and populate SLP instances
    4792              :    accordingly.  Relevant or live reduction stmts are first tried as one
                      :    SLP reduction group, then as sub-groups formed from the lanes that
                      :    matched, and whatever remains gets single-lane SLP discovery.
                      :    With FORCE_SINGLE_LANE, and for lane-reducing stmts, only
                      :    single-lane discovery is done.  MAX_TREE_SIZE, LIMIT and BST_MAP
                      :    are handed to the discovery helpers.  Returns false if something
                      :    fails.  */
    4793              : 
    4794              : static bool
    4795       422764 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
    4796              :                              unsigned max_tree_size, unsigned *limit,
    4797              :                              scalar_stmts_to_slp_tree_map_t *bst_map,
    4798              :                              bool force_single_lane)
    4799              : {
    4800       470067 :   if (loop_vinfo->reductions.is_empty ())
    4801              :     return true;
    4802              : 
    4803              :   /* Collect reduction statements we can combine into
    4804              :      a SLP reduction.  */
    4805        53093 :   vec<stmt_vec_info> scalar_stmts;
    4806        53093 :   scalar_stmts.create (loop_vinfo->reductions.length ());
    4807       234085 :   for (auto next_info : loop_vinfo->reductions)
    4808              :     {
    4809        74806 :       next_info = vect_stmt_to_vectorize (next_info);
    4810        74806 :       if ((STMT_VINFO_RELEVANT_P (next_info)
    4811           14 :            || STMT_VINFO_LIVE_P (next_info))
    4812              :           /* ???  Make sure we didn't skip a conversion around a
    4813              :              reduction path.  In that case we'd have to reverse
    4814              :              engineer that conversion stmt following the chain using
    4815              :              reduc_idx and from the PHI using reduc_def.  */
    4816        74792 :           && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
    4817        74792 :               || (STMT_VINFO_DEF_TYPE (next_info)
    4818              :                   == vect_double_reduction_def)))
    4819              :         {
    4820              :           /* Do not discover SLP reductions combining lane-reducing
    4821              :              ops, that will fail later.  */
    4822        74792 :           if (!force_single_lane
    4823        74792 :               && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
    4824        53794 :             scalar_stmts.quick_push (next_info);
    4825              :           /* Do SLP discovery for single-lane reductions.  */
    4826        20998 :           else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
    4827              :                                                  max_tree_size, limit,
    4828              :                                                  bst_map,
    4829              :                                                  force_single_lane))
    4830              :             {
    4831            0 :               scalar_stmts.release ();
    4832            0 :               return false;
    4833              :             }
    4834              :         }
    4835              :     }
    4836              : 
    4837        53093 :   if (scalar_stmts.length () > 1)
    4838              :     {
    4839              :       /* Try to form a reduction group.  */
    4840         3331 :       unsigned int group_size = scalar_stmts.length ();
    4841         3331 :       bool *matches = XALLOCAVEC (bool, group_size);
    4842         3331 :       if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
    4843              :                                             max_tree_size, limit, matches))
    4844         3227 :         return true;
    4845              : 
    4846              :       /* When analysis as a single SLP reduction group failed try to
    4847              :          form sub-groups by collecting matching lanes.  Do not recurse
    4848              :          that on failure (to limit compile-time costs), but recurse
    4849              :          for the initial non-matching parts.  Everything not covered
    4850              :          by a sub-group gets single-reduction treatment.  */
    4851         2418 :       vec<stmt_vec_info> cands = vNULL;
    4852         7521 :       while (matches[0])
    4853              :         {
    4854         7417 :           cands.truncate (0);
    4855         7417 :           cands.reserve (group_size, true);
    4856        58074 :           for (unsigned i = 0; i < group_size; ++i)
    4857        50657 :             if (matches[i])
    4858        12395 :               cands.quick_push (scalar_stmts[i]);
    4859              : 
    4860              :           /* Try to form a reduction group.  On success the vector is
                      :              no longer ours, so forget about it; on failure we keep
                      :              (and later reuse or release) it.  */
    4861         7417 :           if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
    4862              :                                                 max_tree_size, limit, NULL))
    4863         5396 :             cands = vNULL;
    4864              :           else
    4865              :             {
    4866              :               /* Do SLP discovery for single-lane reductions.  */
    4867        12272 :               for (auto stmt_info : cands)
    4868         6231 :                 if (! vect_analyze_slp_reduction (loop_vinfo,
    4869              :                                                   vect_stmt_to_vectorize
    4870              :                                                     (stmt_info),
    4871              :                                                   max_tree_size, limit,
    4872              :                                                   bst_map, force_single_lane))
    4873              :                   {
    4874           22 :                     scalar_stmts.release ();
    4875           22 :                     cands.release ();
    4876           22 :                     return false;
    4877              :                   }
    4878              :             }
    4879              :           /* Remove the handled stmts from scalar_stmts and try again,
    4880              :              possibly repeating the above with updated matches[].  */
    4881              :           unsigned j = 0;
    4882        57990 :           for (unsigned i = 0; i < group_size; ++i)
    4883        50595 :             if (!matches[i])
    4884              :               {
    4885        38235 :                 scalar_stmts[j] = scalar_stmts[i];
    4886        38235 :                 ++j;
    4887              :               }
    4888         7395 :           scalar_stmts.truncate (j);
    4889         7395 :           group_size = scalar_stmts.length ();
    4890         7395 :           if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
    4891              :                                                 bst_map, max_tree_size, limit,
    4892              :                                                 matches))
                      :             {
                      :               /* CANDS is still owned by us when forming its group
                      :                  failed above; do not leak it.  */
                      :               cands.release ();
    4893              :               return true;
                      :             }
    4894              :         }
                      :       /* Release the candidate vector in case the last sub-group failed
                      :          to form; releasing an empty vector is a no-op.  */
                      :       cands.release ();
    4895              :     }
    4896              :   /* Do SLP discovery for single-lane reductions.  */
    4897       183348 :   for (auto stmt_info : scalar_stmts)
    4898        36313 :     if (! vect_analyze_slp_reduction (loop_vinfo,
    4899              :                                       vect_stmt_to_vectorize (stmt_info),
    4900              :                                       max_tree_size, limit,
    4901              :                                       bst_map, force_single_lane))
    4902              :       {
    4903         2563 :         scalar_stmts.release ();
    4904         2563 :         return false;
    4905              :       }
    4906              : 
    4907        47303 :   scalar_stmts.release ();
    4908        47303 :   return true;
    4909              : }
    4910              : 
    4911              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    4912              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    4913              :    Return FALSE if it's impossible to SLP any stmt in the group.  */
    4914              : 
    4915              : static bool
    4916      1082258 : vect_analyze_slp_instance (vec_info *vinfo,
    4917              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    4918              :                            stmt_vec_info stmt_info,
    4919              :                            slp_instance_kind kind,
    4920              :                            unsigned max_tree_size, unsigned *limit,
    4921              :                            bool force_single_lane)
    4922              : {
    4923      1082258 :   vec<stmt_vec_info> scalar_stmts;
    4924              : 
    4925      1082258 :   if (is_a <bb_vec_info> (vinfo))
    4926      1058542 :     vect_location = stmt_info->stmt;
    4927              : 
    4928      1082258 :   gcc_assert (kind == slp_inst_kind_store);
    4929              : 
    4930              :   /* Collect the stores and store them in scalar_stmts.  */
    4931      1082258 :   scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
    4932      1082258 :   stmt_vec_info next_info = stmt_info;
    4933      5371221 :   while (next_info)
    4934              :     {
    4935      3206705 :       scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
    4936      3206705 :       next_info = DR_GROUP_NEXT_ELEMENT (next_info);
    4937              :     }
    4938              : 
    4939      1082258 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4940      1082258 :   vec<tree> remain = vNULL;
    4941              : 
    4942              :   /* Build the tree for the SLP instance.  */
    4943              : 
    4944              :   /* If there's no budget left bail out early.  */
    4945      1082258 :   if (*limit == 0)
    4946              :     return false;
    4947              : 
    4948      1082235 :   if (dump_enabled_p ())
    4949              :     {
    4950         4111 :       dump_printf_loc (MSG_NOTE, vect_location,
    4951              :                        "Starting SLP discovery for\n");
    4952        23684 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4953        39146 :         dump_printf_loc (MSG_NOTE, vect_location,
    4954        19573 :                          "  %G", scalar_stmts[i]->stmt);
    4955              :     }
    4956              : 
    4957              :   /* Build the tree for the SLP instance.  */
    4958      1082235 :   unsigned int group_size = scalar_stmts.length ();
    4959      1082235 :   bool *matches = XALLOCAVEC (bool, group_size);
    4960      1082235 :   poly_uint64 max_nunits = 1;
    4961      1082235 :   unsigned tree_size = 0;
    4962      1082235 :   unsigned i;
    4963              : 
    4964      1082235 :   slp_tree node = NULL;
    4965      1082235 :   if (group_size > 1 && force_single_lane)
    4966              :     {
    4967         1498 :       matches[0] = true;
    4968         1498 :       matches[1] = false;
    4969              :     }
    4970              :   else
    4971      1080737 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4972              :                                 &max_nunits, matches, limit,
    4973              :                                 &tree_size, bst_map);
    4974      1082235 :   if (node != NULL)
    4975              :     {
    4976              :       /* Calculate the unrolling factor based on the smallest type.  */
    4977       672014 :       poly_uint64 unrolling_factor
    4978       672014 :         = calculate_unrolling_factor (max_nunits, group_size);
    4979              : 
    4980       672014 :       if (maybe_ne (unrolling_factor, 1U)
    4981       672014 :           && is_a <bb_vec_info> (vinfo))
    4982              :         {
    4983            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    4984            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    4985            0 :               || const_max_nunits > group_size)
    4986              :             {
    4987            0 :               if (dump_enabled_p ())
    4988            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4989              :                                  "Build SLP failed: store group "
    4990              :                                  "size not a multiple of the vector size "
    4991              :                                  "in basic block SLP\n");
    4992            0 :               vect_free_slp_tree (node);
    4993            0 :               return false;
    4994              :             }
    4995              :           /* Fatal mismatch.  */
    4996            0 :           if (dump_enabled_p ())
    4997            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    4998              :                              "SLP discovery succeeded but node needs "
    4999              :                              "splitting\n");
    5000            0 :           memset (matches, true, group_size);
    5001            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    5002            0 :           vect_free_slp_tree (node);
    5003              :         }
    5004              :       else
    5005              :         {
    5006              :           /* Create a new SLP instance.  */
    5007       672014 :           slp_instance new_instance = XNEW (class _slp_instance);
    5008       672014 :           SLP_INSTANCE_TREE (new_instance) = node;
    5009       672014 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5010       672014 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5011       672014 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5012       672014 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5013       672014 :           new_instance->reduc_phis = NULL;
    5014       672014 :           new_instance->cost_vec = vNULL;
    5015       672014 :           new_instance->subgraph_entries = vNULL;
    5016              : 
    5017       672014 :           if (dump_enabled_p ())
    5018         3128 :             dump_printf_loc (MSG_NOTE, vect_location,
    5019              :                              "SLP size %u vs. limit %u.\n",
    5020              :                              tree_size, max_tree_size);
    5021              : 
    5022       672014 :           vinfo->slp_instances.safe_push (new_instance);
    5023              : 
    5024              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5025              :              the number of scalar stmts in the root in a few places.
    5026              :              Verify that assumption holds.  */
    5027      1344028 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5028              :                         .length () == group_size);
    5029              : 
    5030       672014 :           if (dump_enabled_p ())
    5031              :             {
    5032         3128 :               dump_printf_loc (MSG_NOTE, vect_location,
    5033              :                                "Final SLP tree for instance %p:\n",
    5034              :                                (void *) new_instance);
    5035         3128 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5036              :                                     SLP_INSTANCE_TREE (new_instance));
    5037              :             }
    5038              : 
    5039       672014 :           return true;
    5040              :         }
    5041              :     }
    5042              :   /* Failed to SLP.  */
    5043              : 
    5044              :   /* Try to break the group up into pieces.  */
    5045       410221 :   if (*limit > 0 && kind == slp_inst_kind_store)
    5046              :     {
    5047              :       /* ???  We could delay all the actual splitting of store-groups
    5048              :          until after SLP discovery of the original group completed.
    5049              :          Then we can recurse to vect_build_slp_instance directly.  */
    5050      1073934 :       for (i = 0; i < group_size; i++)
    5051      1073934 :         if (!matches[i])
    5052              :           break;
    5053              : 
    5054              :       /* For basic block SLP, try to break the group up into multiples of
    5055              :          a vector size.  */
    5056       410220 :       if (is_a <bb_vec_info> (vinfo)
    5057       410220 :           && (i > 1 && i < group_size))
    5058              :         {
    5059              :           /* Free the allocated memory.  */
    5060       154084 :           scalar_stmts.release ();
    5061              : 
    5062       154084 :           tree scalar_type
    5063       154084 :             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    5064       308168 :           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    5065       154084 :                                                       1 << floor_log2 (i));
    5066       154084 :           unsigned HOST_WIDE_INT const_nunits;
    5067       154084 :           if (vectype
    5068       154084 :               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
    5069              :             {
    5070              :               /* Split into two groups at the first vector boundary.  */
    5071       154084 :               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
    5072       154084 :               unsigned group1_size = i & ~(const_nunits - 1);
    5073              : 
    5074       154084 :               if (dump_enabled_p ())
    5075           59 :                 dump_printf_loc (MSG_NOTE, vect_location,
    5076              :                                  "Splitting SLP group at stmt %u\n", i);
    5077       154084 :               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
    5078              :                                                                group1_size);
    5079       154084 :               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
    5080              :                                                     kind, max_tree_size,
    5081              :                                                     limit, false);
    5082              :               /* Split the rest at the failure point and possibly
    5083              :                  re-analyze the remaining matching part if it has
    5084              :                  at least two lanes.  */
    5085       154084 :               if (group1_size < i
    5086         5272 :                   && (i + 1 < group_size
    5087         2902 :                       || i - group1_size > 1))
    5088              :                 {
    5089         2402 :                   stmt_vec_info rest2 = rest;
    5090         2402 :                   rest = vect_split_slp_store_group (rest, i - group1_size);
    5091         2402 :                   if (i - group1_size > 1)
    5092           61 :                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
    5093              :                                                       kind, max_tree_size,
    5094              :                                                       limit, false);
    5095              :                 }
    5096              :               /* Re-analyze the non-matching tail if it has at least
    5097              :                  two lanes.  */
    5098       154084 :               if (i + 1 < group_size)
    5099        21780 :                 res |= vect_analyze_slp_instance (vinfo, bst_map,
    5100              :                                                   rest, kind, max_tree_size,
    5101              :                                                   limit, false);
    5102       154084 :               return res;
    5103              :             }
    5104              :         }
    5105              : 
    5106              :       /* For loop vectorization split the RHS into arbitrary pieces of
    5107              :          size >= 1.  */
    5108       256136 :       else if (is_a <loop_vec_info> (vinfo)
    5109       256136 :                && (group_size != 1 && i < group_size))
    5110              :         {
    5111         6434 :           gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
    5112           28 :           bool masked_p = call
    5113           28 :               && gimple_call_internal_p (call)
    5114           28 :               && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
    5115              :           /* There are targets that cannot do even/odd interleaving schemes
    5116              :              so they absolutely need to use load/store-lanes.  For now
    5117              :              force single-lane SLP for them - they would be happy with
    5118              :              uniform power-of-two lanes (but depending on element size),
    5119              :              but even if we can use 'i' as indicator we would need to
    5120              :              backtrack when later lanes fail to discover with the same
    5121              :              granularity.  We cannot turn any of strided or scatter store
    5122              :              into store-lanes.  */
    5123              :           /* ???  If this is not in sync with what get_load_store_type
    5124              :              later decides the SLP representation is not good for other
    5125              :              store vectorization methods.  */
    5126         6434 :           bool want_store_lanes
    5127         6434 :             = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5128         6434 :                && ! STMT_VINFO_STRIDED_P (stmt_info)
    5129         4893 :                && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5130         4889 :                && compare_step_with_zero (vinfo, stmt_info) > 0
    5131        11300 :                && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
    5132        12868 :                                                  masked_p, group_size, i));
    5133         6434 :           if (want_store_lanes || force_single_lane)
    5134              :             i = 1;
    5135              : 
    5136              :           /* A fatal discovery fail doesn't always mean single-lane SLP
    5137              :              isn't a possibility, so try.  */
    5138         4936 :           if (i == 0)
    5139              :             i = 1;
    5140              : 
    5141         6434 :           if (dump_enabled_p ())
    5142          882 :             dump_printf_loc (MSG_NOTE, vect_location,
    5143              :                              "Splitting SLP group at stmt %u\n", i);
    5144              : 
    5145              :           /* Analyze the stored values and pinch them together with
    5146              :              a permute node so we can preserve the whole store group.  */
    5147         6434 :           auto_vec<slp_tree> rhs_nodes;
    5148         6434 :           poly_uint64 max_nunits = 1;
    5149              : 
    5150         6434 :           unsigned int rhs_common_nlanes = 0;
    5151         6434 :           unsigned int start = 0, end = i;
    5152        29167 :           while (start < group_size)
    5153              :             {
    5154        22963 :               gcc_assert (end - start >= 1);
    5155        22963 :               vec<stmt_vec_info> substmts;
    5156        22963 :               substmts.create (end - start);
    5157        69463 :               for (unsigned j = start; j < end; ++j)
    5158        46500 :                 substmts.quick_push (scalar_stmts[j]);
    5159        22963 :               max_nunits = 1;
    5160        22963 :               node = vect_build_slp_tree (vinfo, substmts, end - start,
    5161              :                                           &max_nunits,
    5162              :                                           matches, limit, &tree_size, bst_map);
    5163        22963 :               if (node)
    5164              :                 {
    5165        18270 :                   rhs_nodes.safe_push (node);
    5166        18270 :                   vect_update_max_nunits (&max_nunits, node->max_nunits);
    5167        18270 :                   if (start == 0)
    5168         6208 :                     rhs_common_nlanes = SLP_TREE_LANES (node);
    5169        12062 :                   else if (rhs_common_nlanes != SLP_TREE_LANES (node))
    5170         1267 :                     rhs_common_nlanes = 0;
    5171        18270 :                   start = end;
    5172        18270 :                   if (want_store_lanes || force_single_lane)
    5173         4532 :                     end = start + 1;
    5174              :                   else
    5175              :                     end = group_size;
    5176              :                 }
    5177              :               else
    5178              :                 {
    5179         4693 :                   substmts.release ();
    5180         4693 :                   if (end - start == 1)
    5181              :                     {
    5182              :                       /* Single-lane discovery failed.  Free resources.  */
    5183          244 :                       for (auto node : rhs_nodes)
    5184            6 :                         vect_free_slp_tree (node);
    5185          230 :                       scalar_stmts.release ();
    5186          230 :                       if (dump_enabled_p ())
    5187           38 :                         dump_printf_loc (MSG_NOTE, vect_location,
    5188              :                                          "SLP discovery failed\n");
    5189          230 :                       return false;
    5190              :                     }
    5191              : 
    5192              :                   /* ???  It really happens that we soft-fail SLP
    5193              :                      build at a mismatch but the matching part hard-fails
    5194              :                      later.  As we know we arrived here with a group
    5195              :                      larger than one try a group of size one!  */
    5196         4463 :                   if (!matches[0])
    5197           42 :                     end = start + 1;
    5198              :                   else
    5199         9934 :                     for (unsigned j = start; j < end; j++)
    5200         9934 :                       if (!matches[j - start])
    5201              :                         {
    5202              :                           end = j;
    5203              :                           break;
    5204              :                         }
    5205              :                 }
    5206              :             }
    5207              : 
    5208              :           /* Now re-assess whether we want store lanes in case the
    5209              :              discovery ended up producing all single-lane RHSs.  */
    5210         6204 :           if (! want_store_lanes
    5211         6204 :               && rhs_common_nlanes == 1
    5212         5339 :               && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5213         5339 :               && ! STMT_VINFO_STRIDED_P (stmt_info)
    5214         4052 :               && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5215         4049 :               && compare_step_with_zero (vinfo, stmt_info) > 0
    5216        10242 :               && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
    5217              :                                               group_size, masked_p)
    5218              :                   != IFN_LAST))
    5219              :             want_store_lanes = true;
    5220              : 
    5221              :           /* Now we assume we can build the root SLP node from all stores.  */
    5222         6204 :           if (want_store_lanes)
    5223              :             {
    5224              :               /* For store-lanes feed the store node with all RHS nodes
    5225              :                  in order.  */
    5226            0 :               node = vect_create_new_slp_node (scalar_stmts,
    5227            0 :                                                SLP_TREE_CHILDREN
    5228              :                                                  (rhs_nodes[0]).length ());
    5229            0 :               SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    5230            0 :               node->max_nunits = max_nunits;
    5231            0 :               node->ldst_lanes = true;
    5232            0 :               SLP_TREE_CHILDREN (node)
    5233            0 :                 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
    5234            0 :                                 + rhs_nodes.length () - 1);
    5235              :               /* First store value and possibly mask.  */
    5236            0 :               SLP_TREE_CHILDREN (node)
    5237            0 :                 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
    5238              :               /* Rest of the store values.  All mask nodes are the same,
    5239              :                  this should be guaranteed by dataref group discovery.  */
    5240            0 :               for (unsigned j = 1; j < rhs_nodes.length (); ++j)
    5241            0 :                 SLP_TREE_CHILDREN (node)
    5242            0 :                   .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
    5243            0 :               for (slp_tree child : SLP_TREE_CHILDREN (node))
    5244            0 :                 child->refcnt++;
    5245              :             }
    5246              :           else
    5247         6204 :             node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
    5248              :                                                       max_nunits);
    5249              : 
    5250        24468 :           while (!rhs_nodes.is_empty ())
    5251        18264 :             vect_free_slp_tree (rhs_nodes.pop ());
    5252              : 
    5253              :           /* Create a new SLP instance.  */
    5254         6204 :           slp_instance new_instance = XNEW (class _slp_instance);
    5255         6204 :           SLP_INSTANCE_TREE (new_instance) = node;
    5256         6204 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5257         6204 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5258         6204 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5259         6204 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5260         6204 :           new_instance->reduc_phis = NULL;
    5261         6204 :           new_instance->cost_vec = vNULL;
    5262         6204 :           new_instance->subgraph_entries = vNULL;
    5263              : 
    5264         6204 :           if (dump_enabled_p ())
    5265          844 :             dump_printf_loc (MSG_NOTE, vect_location,
    5266              :                              "SLP size %u vs. limit %u.\n",
    5267              :                              tree_size, max_tree_size);
    5268              : 
    5269         6204 :           vinfo->slp_instances.safe_push (new_instance);
    5270              : 
    5271              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5272              :              the number of scalar stmts in the root in a few places.
    5273              :              Verify that assumption holds.  */
    5274        12408 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5275              :                         .length () == group_size);
    5276              : 
    5277         6204 :           if (dump_enabled_p ())
    5278              :             {
    5279          844 :               dump_printf_loc (MSG_NOTE, vect_location,
    5280              :                                "Final SLP tree for instance %p:\n",
    5281              :                                (void *) new_instance);
    5282          844 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5283              :                                     SLP_INSTANCE_TREE (new_instance));
    5284              :             }
    5285         6204 :           return true;
    5286         6434 :         }
    5287              :       else
    5288              :         /* Free the allocated memory.  */
    5289       249702 :         scalar_stmts.release ();
    5290              : 
    5291              :       /* Even though the first vector did not all match, we might be able to SLP
    5292              :          (some) of the remainder.  FORNOW ignore this possibility.  */
    5293              :     }
    5294              :   else
    5295              :     /* Free the allocated memory.  */
    5296            1 :     scalar_stmts.release ();
    5297              : 
    5298              :   /* Failed to SLP.  */
    5299       249703 :   if (dump_enabled_p ())
    5300           42 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    5301              :   return false;
    5302              : }
    5303              : 
    5304              : /* qsort comparator ordering SLP load nodes: loads of the same dataref group are ordered by descending lane count, then unpermuted before permuted; other loads are ordered by the UID of their (group first) stmt.  */
    5305              : 
    5306              : static int
    5307      2235261 : vllp_cmp (const void *a_, const void *b_)
    5308              : {
    5309      2235261 :   const slp_tree a = *(const slp_tree *)a_;
    5310      2235261 :   const slp_tree b = *(const slp_tree *)b_;
    5311      2235261 :   stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
    5312      2235261 :   stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
    5313      2235261 :   if (STMT_VINFO_GROUPED_ACCESS (a0)
    5314      1366277 :       && STMT_VINFO_GROUPED_ACCESS (b0)
    5315      3541172 :       && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5316              :     {
    5317              :       /* Same group, order by descending number of lanes used.  */
    5318       296429 :       if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
    5319              :         return 1;  /* A uses fewer lanes, sort it after B.  */
    5320       290398 :       else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
    5321              :         return -1;  /* A uses more lanes, sort it before B.  */
    5322              :       else
    5323              :         {
    5324              :           /* Try to order loads using the same lanes together, breaking
    5325              :              the tie with the lane number that first differs.  */
    5326       283740 :           if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5327       283740 :               && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5328              :             return 0;  /* Neither load has a load permutation.  */
    5329       283740 :           else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5330       283740 :                    && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5331              :             return 1;  /* Only A is permuted, sort it after B.  */
    5332       281146 :           else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5333       281146 :                    && SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5334              :             return -1;  /* Only B is permuted, sort A before B.  */
    5335              :           else
    5336              :             {
    5337       276366 :               for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
    5338       276366 :                 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5339       276366 :                     != SLP_TREE_LOAD_PERMUTATION (b)[i])
    5340              :                   {
    5341              :                     /* In-order lane first, that's what the above case for
    5342              :                        no permutation does.  */
    5343       275534 :                     if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
    5344              :                       return -1;
    5345       167925 :                     else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
    5346              :                       return 1;
    5347        88830 :                     else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5348        88830 :                              < SLP_TREE_LOAD_PERMUTATION (b)[i])
    5349              :                       return -1;  /* Neither lane is in order, smaller lane number first.  */
    5350              :                     else
    5351              :                       return 1;
    5352              :                   }
    5353              :               return 0;  /* The permutations agree on every lane compared.  */
    5354              :             }
    5355              :         }
    5356              :     }
    5357              :   else /* Different groups or non-groups.  */
    5358              :     {
    5359              :       /* Order groups as their first element to keep them together.  */
    5360      1938832 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5361      1938832 :         a0 = DR_GROUP_FIRST_ELEMENT (a0);
    5362      1938832 :       if (STMT_VINFO_GROUPED_ACCESS (b0))
    5363      1938832 :         b0 = DR_GROUP_FIRST_ELEMENT (b0);
    5364      1938832 :       if (a0 == b0)
    5365              :         return 0;  /* Both nodes are represented by the same stmt.  */
    5366              :       /* Break the tie using the UID of the representative stmt, lower UID first.  */
    5367      1938712 :       else if (gimple_uid (STMT_VINFO_STMT (a0))
    5368      1938712 :                < gimple_uid (STMT_VINFO_STMT (b0)))
    5369              :         return -1;
    5370              :       else
    5371              :         {
    5372       853181 :           gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
    5373              :                       != gimple_uid (STMT_VINFO_STMT (b0)));
    5374              :           return 1;  /* Distinct stmts are asserted above to have distinct UIDs.  */
    5375              :         }
    5376              :     }
    5377              : }
    5378              : 
    5379              : /* Return whether the load permutation of NODE is consecutive, that is
    5380              :    element I equals START_VAL + I.  If START_VAL is UINT_MAX the first
    5381              :    element's value is used.  Return false if NODE has no (or an empty) load permutation.  */
    5382              : 
    5383              : bool
    5384       544112 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
    5385              : {
    5386       544112 :   load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
    5387              : 
    5388       544112 :   if (!perm.exists () || !perm.length ())
    5389              :     return false;
    5390              : 
    5391       544112 :   if (start_val == UINT_MAX)
    5392        73984 :     start_val = perm[0];  /* UINT_MAX is the sentinel for "start at the first element's value".  */
    5393              : 
    5394      1075901 :   for (unsigned int i = 0; i < perm.length (); i++)
    5395       549905 :     if (perm[i] != start_val + (unsigned int) i)
    5396              :       return false;  /* Element I breaks the consecutive run.  */
    5397              : 
    5398              :   return true;
    5399              : }
    5400              : 
    5401              : /* Process the set of LOADS that are all from the same dataref group.  */
    5402              : 
    5403              : static void
    5404       150948 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5405              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5406              :                               const array_slice<slp_tree> &loads,
    5407              :                               bool force_single_lane)
    5408              : {
    5409              :   /* We at this point want to lower without a fixed VF or vector
    5410              :      size in mind which means we cannot actually compute whether we
    5411              :      need three or more vectors for a load permutation yet.  So always
    5412              :      lower.  */
    5413       150948 :   stmt_vec_info first
    5414       150948 :     = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
    5415       150948 :   unsigned group_lanes = DR_GROUP_SIZE (first);
    5416              : 
    5417              :   /* Verify if all load permutations can be implemented with a suitably
    5418              :      large element load-lanes operation.  */
    5419       150948 :   unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
    5420       150948 :   if (STMT_VINFO_STRIDED_P (first)
    5421       148834 :       || compare_step_with_zero (loop_vinfo, first) <= 0
    5422       146503 :       || exact_log2 (ld_lanes_lanes) == -1
    5423              :       /* ???  For now only support the single-lane case as there is
    5424              :          missing support on the store-lane side and code generation
    5425              :          isn't up to the task yet.  */
    5426       144393 :       || ld_lanes_lanes != 1
    5427       288206 :       || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
    5428              :                                     group_lanes / ld_lanes_lanes,
    5429              :                                     false) == IFN_LAST)
    5430              :     ld_lanes_lanes = 0;
    5431              :   else
    5432              :     /* Verify the loads access the same number of lanes aligned to
    5433              :        ld_lanes_lanes.  */
    5434            0 :     for (slp_tree load : loads)
    5435              :       {
    5436            0 :         if (SLP_TREE_LANES (load) != ld_lanes_lanes)
    5437              :           {
    5438              :             ld_lanes_lanes = 0;
    5439              :             break;
    5440              :           }
    5441            0 :         unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
    5442            0 :         if (first % ld_lanes_lanes != 0)
    5443              :           {
    5444              :             ld_lanes_lanes = 0;
    5445              :             break;
    5446              :           }
    5447            0 :         if (!vect_load_perm_consecutive_p (load))
    5448              :           {
    5449              :             ld_lanes_lanes = 0;
    5450              :             break;
    5451              :           }
    5452              :       }
    5453              : 
    5454              :   /* Only a power-of-two number of lanes matches interleaving with N levels.
    5455              :      ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
    5456              :      at each step.  */
    5457       248439 :   if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    5458              :     return;
    5459              : 
    5460       237971 :   for (slp_tree load : loads)
    5461              :     {
    5462              :       /* Leave masked or gather loads alone for now.  */
    5463       168825 :       if (!SLP_TREE_CHILDREN (load).is_empty ())
    5464        48193 :         continue;
    5465              : 
    5466              :       /* For single-element interleaving spanning multiple vectors avoid
    5467              :          lowering, we want to use VMAT_ELEMENTWISE later.  */
    5468       168819 :       if (ld_lanes_lanes == 0
    5469       168819 :           && SLP_TREE_LANES (load) == 1
    5470       155680 :           && !DR_GROUP_NEXT_ELEMENT (first)
    5471       247018 :           && maybe_gt (group_lanes,
    5472              :                        TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
    5473        50380 :         return;
    5474              : 
    5475              :       /* We want to pattern-match special cases here and keep those
    5476              :          alone.  Candidates are splats and load-lane.  */
    5477              : 
    5478              :       /* We need to lower only loads of less than half of the groups
    5479              :          lanes, including duplicate lanes.  Note this leaves nodes
    5480              :          with a non-1:1 load permutation around instead of canonicalizing
    5481              :          those into a load and a permute node.  Removing this early
    5482              :          check would do such canonicalization.  */
    5483       118439 :       if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
    5484        44809 :           && ld_lanes_lanes == 0)
    5485        44809 :         continue;
    5486              : 
    5487              :       /* Build the permute to get the original load permutation order.  */
    5488        73630 :       bool contiguous = vect_load_perm_consecutive_p (load);
    5489        73630 :       lane_permutation_t final_perm;
    5490        73630 :       final_perm.create (SLP_TREE_LANES (load));
    5491       147918 :       for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
    5492       148576 :         final_perm.quick_push (
    5493        74288 :           std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
    5494              : 
    5495              :       /* When the load permutation accesses a contiguous unpermuted,
    5496              :          power-of-two aligned and sized chunk leave the load alone.
    5497              :          We can likely (re-)load it more efficiently rather than
    5498              :          extracting it from the larger load.
    5499              :          ???  Long-term some of the lowering should move to where
    5500              :          the vector types involved are fixed.  */
    5501        77008 :       if (!force_single_lane
    5502        73630 :           && ld_lanes_lanes == 0
    5503        48904 :           && contiguous
    5504        48667 :           && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
    5505         6373 :           && pow2p_hwi (SLP_TREE_LANES (load))
    5506         6337 :           && pow2p_hwi (group_lanes)
    5507         3378 :           && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
    5508        77008 :           && group_lanes % SLP_TREE_LANES (load) == 0)
    5509              :         {
    5510         3378 :           final_perm.release ();
    5511         3378 :           continue;
    5512              :         }
    5513              : 
    5514              :       /* First build (and possibly re-use) a load node for the
    5515              :          unpermuted group.  Gaps in the middle and on the end are
    5516              :          represented with NULL stmts.  */
    5517        70252 :       vec<stmt_vec_info> stmts;
    5518        70252 :       stmts.create (group_lanes);
    5519       245273 :       for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
    5520              :         {
    5521       175021 :           if (s != first)
    5522       108852 :             for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
    5523         4083 :               stmts.quick_push (NULL);
    5524       175021 :           stmts.quick_push (s);
    5525              :         }
    5526       131478 :       for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
    5527        61226 :         stmts.quick_push (NULL);
    5528        70252 :       poly_uint64 max_nunits = 1;
    5529        70252 :       bool *matches = XALLOCAVEC (bool, group_lanes);
    5530        70252 :       unsigned limit = 1;
    5531        70252 :       unsigned tree_size = 0;
    5532        70252 :       slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
    5533              :                                          group_lanes,
    5534              :                                          &max_nunits, matches, &limit,
    5535        70252 :                                          &tree_size, bst_map);
    5536        70252 :       gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
    5537              : 
    5538        70252 :       if (ld_lanes_lanes != 0)
    5539              :         {
    5540              :           /* ???  If this is not in sync with what get_load_store_type
    5541              :              later decides the SLP representation is not good for other
    5542              :              store vectorization methods.  */
    5543            0 :           l0->ldst_lanes = true;
    5544            0 :           load->ldst_lanes = true;
    5545              :         }
    5546              : 
    5547       217422 :       while (1)
    5548              :         {
    5549       143837 :           unsigned group_lanes = SLP_TREE_LANES (l0);
    5550       143837 :           if (ld_lanes_lanes != 0
    5551       143837 :               || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
    5552              :             break;
    5553              : 
    5554              :           /* Try to lower by reducing the group to half its size using an
    5555              :              interleaving scheme.  For this try to compute whether all
    5556              :              elements needed for this load are in even or odd elements of
    5557              :              an even/odd decomposition with N consecutive elements.
    5558              :              Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
    5559              :              with N == 2.  */
    5560              :           /* ???  Only an even number of lanes can be handled this way, but the
    5561              :              fallback below could work for any number.  We have to make sure
    5562              :              to round up in that case.  */
    5563        73585 :           gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
    5564         9807 :           unsigned even = 0, odd = 0;
    5565         9807 :           if ((group_lanes & 1) == 0)
    5566              :             {
    5567         9807 :               even = (1 << ceil_log2 (group_lanes)) - 1;
    5568         9807 :               odd = even;
    5569        39899 :               for (auto l : final_perm)
    5570              :                 {
    5571        10478 :                   even &= ~l.second;
    5572        10478 :                   odd &= l.second;
    5573              :                 }
    5574              :             }
    5575              : 
    5576              :           /* Now build an even or odd extraction from the unpermuted load.  */
    5577        73585 :           lane_permutation_t perm;
    5578        73585 :           perm.create ((group_lanes + 1) / 2);
    5579        73585 :           unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
    5580        73585 :           unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
    5581        73585 :           if (even_level
    5582         9051 :               && group_lanes % (2 * even_level) == 0
    5583              :               /* ???  When code generating permutes we do not try to pun
    5584              :                  to larger component modes so level != 1 isn't a natural
    5585              :                  even/odd extract.  Prefer one if possible.  */
    5586         9051 :               && (even_level == 1 || !odd_level || odd_level != 1))
    5587              :             {
    5588              :               /* { 0, 1, ... 4, 5 ..., } */
    5589        33232 :               for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
    5590        52520 :                 for (unsigned j = 0; j < even_level; ++j)
    5591        26430 :                   perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
    5592              :             }
    5593        64534 :           else if (odd_level)
    5594              :             {
    5595              :               /* { ..., 2, 3, ... 6, 7 } */
    5596         2635 :               gcc_assert (group_lanes % (2 * odd_level) == 0);
    5597        11413 :               for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
    5598        17610 :                 for (unsigned j = 0; j < odd_level; ++j)
    5599         8832 :                   perm.quick_push
    5600         8832 :                     (std::make_pair (0, (2 * i + 1) * odd_level + j));
    5601              :             }
    5602              :           else
    5603              :             {
    5604              :               /* As fallback extract all used lanes and fill to half the
    5605              :                  group size by repeating the last element.
    5606              :                  ???  This is quite a bad strategy for re-use - we could
    5607              :                  brute force our way to find more optimal filling lanes to
    5608              :                  maximize re-use when looking at all loads from the group.  */
    5609        63808 :               auto_bitmap l;
    5610       255288 :               for (auto p : final_perm)
    5611        63864 :                 bitmap_set_bit (l, p.second);
    5612        63808 :               unsigned i = 0;
    5613        63808 :               bitmap_iterator bi;
    5614       127672 :               EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
    5615        63864 :                   perm.quick_push (std::make_pair (0, i));
    5616       255384 :               while (perm.length () < (group_lanes + 1) / 2)
    5617        63884 :                 perm.quick_push (perm.last ());
    5618        63808 :             }
    5619              : 
    5620              :           /* Update final_perm with the intermediate permute.  */
    5621       147841 :           for (unsigned i = 0; i < final_perm.length (); ++i)
    5622              :             {
    5623        74256 :               unsigned l = final_perm[i].second;
    5624        74256 :               unsigned j;
    5625        81456 :               for (j = 0; j < perm.length (); ++j)
    5626        81456 :                 if (perm[j].second == l)
    5627              :                   {
    5628        74256 :                     final_perm[i].second = j;
    5629        74256 :                     break;
    5630              :                   }
    5631        74256 :               gcc_assert (j < perm.length ());
    5632              :             }
    5633              : 
    5634              :           /* And create scalar stmts.  */
    5635        73585 :           vec<stmt_vec_info> perm_stmts;
    5636        73585 :           perm_stmts.create (perm.length ());
    5637       236595 :           for (unsigned i = 0; i < perm.length (); ++i)
    5638       163010 :             perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
    5639              : 
    5640        73585 :           slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    5641        73585 :           SLP_TREE_CHILDREN (p).quick_push (l0);
    5642        73585 :           SLP_TREE_LANE_PERMUTATION (p) = perm;
    5643        73585 :           SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
    5644        73585 :           SLP_TREE_LANES (p) = perm.length ();
    5645        73585 :           SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
    5646              :           /* ???  As we have scalar stmts for this intermediate permute we
    5647              :              could CSE it via bst_map but we do not want to pick up
    5648              :              another SLP node with a load permutation.  We instead should
    5649              :              have a "local" CSE map here.  */
    5650        73585 :           SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
    5651              : 
    5652              :           /* We now have a node for (group_lanes + 1) / 2 lanes.  */
    5653        73585 :           l0 = p;
    5654        73585 :         }
    5655              : 
    5656              :       /* And finally from the ordered reduction node create the
    5657              :          permute to shuffle the lanes into the original load-permutation
    5658              :          order.  We replace the original load node with this.  */
    5659        70252 :       SLP_TREE_CODE (load) = VEC_PERM_EXPR;
    5660        70252 :       SLP_TREE_LOAD_PERMUTATION (load).release ();
    5661        70252 :       SLP_TREE_LANE_PERMUTATION (load) = final_perm;
    5662        70252 :       SLP_TREE_CHILDREN (load).create (1);
    5663        70252 :       SLP_TREE_CHILDREN (load).quick_push (l0);
    5664              :     }
    5665              : }
    5666              : 
    5667              : /* Transform SLP loads in the SLP graph created by SLP discovery to
    5668              :    group loads from the same group and lower load permutations that
    5669              :    are unlikely to be supported into a series of permutes.
    5670              :    In the degenerate case of having only single-lane SLP instances
    5671              :    this should result in a series of permute nodes emulating an
    5672              :    interleaving scheme.
                           This overload is the driver: it collects the load nodes of all
                           SLP instances of LOOP_VINFO, partitions them into runs that share
                           the same DR_GROUP_FIRST_ELEMENT and hands each run of grouped
                           accesses to the array_slice overload.  Loads that are not grouped
                           accesses are left untouched.  BST_MAP and FORCE_SINGLE_LANE are
                           only forwarded to that overload.  */
    5673              : 
    5674              : static void
    5675       405174 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5676              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5677              :                               bool force_single_lane)
    5678              : {
    5679              :   /* Gather and sort loads across all instances.  */
                          /* One VISITED set is shared by all gather calls - presumably so a
                             node reachable from several instances is collected only once.  */
    5680       405174 :   hash_set<slp_tree> visited;
    5681       405174 :   auto_vec<slp_tree> loads;
    5682      1877416 :   for (auto inst : loop_vinfo->slp_instances)
    5683       663804 :     vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
    5684       405174 :   if (loads.is_empty ())
    5685        70841 :     return;
                          /* The partitioning below only ever compares the head of the current
                             run with the next entry, so it relies on vllp_cmp (defined
                             elsewhere) ordering loads of one group next to each other.  */
    5686       334333 :   loads.qsort (vllp_cmp);
    5687              : 
    5688              :   /* Now process each dataref group separately.  */
                          /* FIRSTI is the index of the first load of the run being built.  */
    5689       334333 :   unsigned firsti = 0;
    5690       621571 :   for (unsigned i = 1; i < loads.length (); ++i)
    5691              :     {
    5692       287238 :       slp_tree first = loads[firsti];
    5693       287238 :       slp_tree next = loads[i];
    5694       287238 :       stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
    5695       287238 :       stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
                              /* NEXT still belongs to the run headed by FIRST: keep extending.  */
    5696       287238 :       if (STMT_VINFO_GROUPED_ACCESS (a0)
    5697       144831 :           && STMT_VINFO_GROUPED_ACCESS (b0)
    5698       419114 :           && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5699        54256 :         continue;
    5700              :       /* Now we have one or multiple SLP loads of the same group from
    5701              :          firsti to i - 1.  */
    5702       232982 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5703        90575 :         vect_lower_load_permutations (loop_vinfo, bst_map,
    5704        90575 :                                       make_array_slice (&loads[firsti],
    5705              :                                                         i - firsti),
    5706              :                                       force_single_lane);
                              /* Start the next run at the load that ended this one.  */
    5707              :       firsti = i;
    5708              :     }
                          /* The loop only flushes a run when it sees the first load of the
                             following one, so the final run [firsti, length) is handled here.  */
    5709       668666 :   if (firsti < loads.length ()
    5710       668666 :       && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
    5711        60373 :     vect_lower_load_permutations (loop_vinfo, bst_map,
    5712        60373 :                                   make_array_slice (&loads[firsti],
    5713        60373 :                                                     loads.length () - firsti),
    5714              :                                   force_single_lane);
    5715       405174 : }
    5716              : 
    5717              : /* Check if there are stmts in the loop that can be vectorized using SLP.
    5718              :    Build SLP trees of packed scalar stmts if SLP is possible.
                           VINFO is either a loop_vec_info or a bb_vec_info; the loop-only and
                           basic-block-only discovery steps are selected with dyn_cast.
                           MAX_TREE_SIZE is passed to every SLP build and also seeds the LIMIT
                           budget those builds receive by pointer.  FORCE_SINGLE_LANE is
                           forwarded to the loop-side builds; basic-block roots are always
                           built with it false.
                           Returns opt_result::success (), or a failure_at result with the
                           message "SLP build failed." as soon as a loop-side build fails.
                           A failing grouped-store or root build in a basic block does not
                           fail the analysis.  */
    5719              : 
    5720              : opt_result
    5721      1035590 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
    5722              :                   bool force_single_lane)
    5723              : {
    5724      1035590 :   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
    5725      1035590 :   unsigned int i;
    5726      1035590 :   stmt_vec_info first_element;
    5727      1035590 :   slp_instance instance;
    5728              : 
    5729      1035590 :   DUMP_VECT_SCOPE ("vect_analyze_slp");
    5730              : 
    5731      1035590 :   unsigned limit = max_tree_size;
    5732              : 
    5733      1035590 :   scalar_stmts_to_slp_tree_map_t *bst_map
    5734      1035590 :     = new scalar_stmts_to_slp_tree_map_t ();
    5735              : 
    5736              :   /* Find SLP sequences starting from groups of grouped stores.  */
    5737      2977275 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    5738       906333 :     if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
    5739              :                                      slp_inst_kind_store, max_tree_size, &limit,
    5740              :                                      force_single_lane)
    5741       906333 :         && loop_vinfo)
    5742          238 :       return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5743              : 
    5744              :   /* For loops also start SLP discovery from non-grouped stores.  */
    5745      1035352 :   if (loop_vinfo)
    5746              :     {
    5747              :       data_reference_p dr;
    5748      1369971 :       FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    5749       947207 :         if (DR_IS_WRITE (dr))
    5750              :           {
    5751       286926 :             stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
    5752              :             /* Grouped stores are already handled above.  */
    5753       286926 :             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    5754        76070 :               continue;
    5755       210856 :             vec<stmt_vec_info> stmts;
    5756       210856 :             vec<stmt_vec_info> roots = vNULL;
    5757       210856 :             vec<tree> remain = vNULL;
    5758       210856 :             stmts.create (1);
    5759       210856 :             stmts.quick_push (stmt_info);
    5760       210856 :             if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5761              :                                            stmts, roots, remain, max_tree_size,
    5762              :                                            &limit, bst_map, force_single_lane))
    5763         3585 :               return opt_result::failure_at (vect_location,
    5764              :                                              "SLP build failed.\n");
    5765              :           }
    5766              : 
    5767              :       stmt_vec_info stmt_info;
    5768       422804 :       FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
    5769              :         {
    5770           20 :           vec<stmt_vec_info> stmts;
    5771           20 :           vec<stmt_vec_info> roots = vNULL;
    5772           20 :           vec<tree> remain = vNULL;
    5773           20 :           stmts.create (1);
    5774           20 :           stmts.quick_push (stmt_info);
    5775           20 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5776              :                                          stmts, roots, remain, max_tree_size,
    5777              :                                          &limit, bst_map, force_single_lane))
    5778            0 :             return opt_result::failure_at (vect_location,
    5779              :                                            "SLP build failed.\n");
    5780              :         }
    5781              :     }
    5782              : 
    5783      1031767 :   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    5784              :     {
    5785      1809092 :       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
    5786              :         {
    5787      1200089 :           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
    5788              :           /* Apply patterns.  */
    5789      3752061 :           for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
    5790      5103944 :             bb_vinfo->roots[i].stmts[j]
    5791      2625048 :               = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
    5792      1200089 :           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
    5793      1200089 :                                        bb_vinfo->roots[i].stmts,
    5794      1200089 :                                        bb_vinfo->roots[i].roots,
    5795      1200089 :                                        bb_vinfo->roots[i].remain,
    5796              :                                        max_tree_size, &limit, bst_map, false))
    5797              :             {
    5798       126893 :               bb_vinfo->roots[i].roots = vNULL;
    5799       126893 :               bb_vinfo->roots[i].remain = vNULL;
    5800              :             }
    5801      1200089 :           bb_vinfo->roots[i].stmts = vNULL;
    5802              :         }
    5803              :     }
    5804              : 
    5805      1031767 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    5806              :     {
    5807              :       /* Find SLP sequences starting from groups of reductions.  */
    5808       422764 :       if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
    5809              :                                         bst_map, force_single_lane))
    5810         2585 :         return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5811              : 
    5812              :       /* Make sure to vectorize only-live stmts, usually inductions.  */
    5813      1923343 :       for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    5814      1270335 :         for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
    5815       597939 :              gsi_next (&gsi))
    5816              :           {
    5817       607529 :             gphi *lc_phi = *gsi;
    5818       607529 :             tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
    5819       607529 :             stmt_vec_info stmt_info;
    5820       607529 :             if (TREE_CODE (def) == SSA_NAME
    5821       496475 :                 && !virtual_operand_p (def)
    5822       268934 :                 && (stmt_info = loop_vinfo->lookup_def (def))
    5823       238256 :                 && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
    5824       238256 :                 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
    5825       178573 :                 && STMT_VINFO_LIVE_P (stmt_info)
    5826       178573 :                 && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
    5827       714396 :                 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    5828              :               {
    5829       106788 :                 vec<stmt_vec_info> stmts;
    5830       106788 :                 vec<stmt_vec_info> roots = vNULL;
    5831       106788 :                 vec<tree> remain = vNULL;
    5832       106788 :                 stmts.create (1);
    5833       106788 :                 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
    5834       106788 :                 if (! vect_build_slp_instance (vinfo,
    5835              :                                                slp_inst_kind_reduc_group,
    5836              :                                                stmts, roots, remain,
    5837              :                                                max_tree_size, &limit,
    5838              :                                                bst_map, force_single_lane))
    5839         9590 :                   return opt_result::failure_at (vect_location,
    5840              :                                                  "SLP build failed.\n");
    5841              :               }
    5842         9590 :           }
    5843              : 
    5844              :       /* Find SLP sequences starting from gconds.  */
    5845      1108487 :       for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
    5846              :         {
    5847       274535 :           auto cond_info = loop_vinfo->lookup_stmt (cond);
    5848              : 
    5849       274535 :           cond_info = vect_stmt_to_vectorize (cond_info);
    5850       274535 :           vec<stmt_vec_info> roots = vNULL;
    5851       274535 :           roots.safe_push (cond_info);
    5852       274535 :           gimple *stmt = STMT_VINFO_STMT (cond_info);
    5853       274535 :           tree args0 = gimple_cond_lhs (stmt);
    5854       274535 :           tree args1 = gimple_cond_rhs (stmt);
    5855              : 
    5856              :           /* These should be enforced by cond lowering, but if it failed
    5857              :              bail.  */
    5858       274535 :           if (gimple_cond_code (stmt) != NE_EXPR
    5859       273457 :               || TREE_TYPE (args0) != boolean_type_node
    5860       547345 :               || !integer_zerop (args1))
    5861              :             {
    5862         1725 :               roots.release ();
    5863         1725 :               return opt_result::failure_at (vect_location,
    5864              :                                              "SLP build failed.\n");
    5865              :             }
    5866              : 
    5867              :           /* An argument without a loop def will be codegened from vectorizing the
    5868              :              root gcond itself.  As such we don't need to try to build an SLP tree
    5869              :              from them.  It's highly likely that the resulting SLP tree here if both
    5870              :              arguments have a def will be incompatible, but we rely on it being split
    5871              :              later on.  */
    5872       272810 :           auto varg = loop_vinfo->lookup_def (args0);
    5873       272810 :           vec<stmt_vec_info> stmts;
    5874       272810 :           vec<tree> remain = vNULL;
    5875       272810 :           stmts.create (1);
    5876       272810 :           stmts.quick_push (vect_stmt_to_vectorize (varg));
    5877              : 
    5878       272810 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
    5879              :                                          stmts, roots, remain,
    5880              :                                          max_tree_size, &limit,
    5881              :                                          bst_map, force_single_lane))
    5882              :             {
    5883         3690 :               roots.release ();
    5884         3690 :               return opt_result::failure_at (vect_location,
    5885              :                                              "SLP build failed.\n");
    5886              :             }
    5887              :         }
    5888              :     }
    5889              : 
    5890      1014177 :   hash_set<slp_tree> visited_patterns;
    5891      1014177 :   slp_tree_to_load_perm_map_t perm_cache;
    5892      1014177 :   slp_compat_nodes_map_t compat_cache;
    5893              : 
    5894              :   /* See if any patterns can be found in the SLP tree.  */
    5895      1014177 :   bool pattern_found = false;
    5896      3473791 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5897      1445437 :     pattern_found |= vect_match_slp_patterns (instance, vinfo,
    5898              :                                               &visited_patterns, &perm_cache,
    5899              :                                               &compat_cache);
    5900              : 
    5901              :   /* If any were found optimize permutations of loads.  */
    5902      1014177 :   if (pattern_found)
    5903              :     {
    5904          202 :       hash_map<slp_tree, slp_tree> load_map;
    5905         3239 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5906              :         {
    5907         2835 :           slp_tree root = SLP_INSTANCE_TREE (instance);
    5908         2835 :           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
    5909              :                                         &load_map, root);
    5910              :         }
    5911          202 :     }
    5912              : 
    5913              :   /* Check whether we should force some SLP instances to use load/store-lanes
    5914              :      and do so by forcing SLP re-discovery with single lanes.  We used
    5915              :      to cancel SLP when this applied to all instances in a loop but now
    5916              :      we decide this per SLP instance.  It's important to do this only
    5917              :      after SLP pattern recognition.  */
    5918      1014177 :   if (is_a <loop_vec_info> (vinfo))
    5919      1068978 :     FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5920       663804 :       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
    5921       229031 :           && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
    5922              :         {
    5923       229031 :           slp_tree slp_root = SLP_INSTANCE_TREE (instance);
    5924       229031 :           unsigned int group_size = SLP_TREE_LANES (slp_root);
    5925       229031 :           tree vectype = SLP_TREE_VECTYPE (slp_root);
    5926              : 
    5927       229031 :           stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
    5928       229031 :           gimple *rep = STMT_VINFO_STMT (rep_info);
    5929       229031 :           bool masked = (is_gimple_call (rep)
    5930         1366 :                          && gimple_call_internal_p (rep)
    5931       230377 :                          && internal_fn_mask_index
    5932         1346 :                               (gimple_call_internal_fn (rep)) != -1);
    5933       229011 :           if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
    5934        23454 :               || slp_root->ldst_lanes
    5935       252485 :               || (vect_store_lanes_supported (vectype, group_size, masked)
    5936              :                   == IFN_LAST))
    5937       229031 :             continue;
    5938              : 
    5939            0 :           auto_vec<slp_tree> loads;
    5940            0 :           hash_set<slp_tree> visited;
    5941            0 :           vect_gather_slp_loads (loads, slp_root, visited);
    5942              : 
    5943              :           /* Check whether any load in the SLP instance is possibly
    5944              :              permuted.  */
    5945            0 :           bool loads_permuted = false;
    5946            0 :           slp_tree load_node;
    5947            0 :           unsigned j;
    5948            0 :           FOR_EACH_VEC_ELT (loads, j, load_node)
    5949              :             {
    5950            0 :               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
    5951            0 :                 continue;
    5952              :               unsigned k;
    5953              :               stmt_vec_info load_info;
    5954            0 :               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
    5955            0 :                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
    5956              :                   {
    5957              :                     loads_permuted = true;
    5958              :                     break;
    5959              :                   }
    5960              :             }
    5961              : 
    5962              :           /* If the loads and stores can use load/store-lanes force re-discovery
    5963              :              with single lanes.  */
    5964            0 :           if (loads_permuted)
    5965              :             {
    5966            0 :               bool can_use_lanes = true;
    5967              :               bool prefer_load_lanes = false;
    5968            0 :               FOR_EACH_VEC_ELT (loads, j, load_node)
    5969            0 :                 if (STMT_VINFO_GROUPED_ACCESS
    5970              :                       (SLP_TREE_REPRESENTATIVE (load_node)))
    5971              :                   {
    5972            0 :                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
    5973              :                         (SLP_TREE_REPRESENTATIVE (load_node));
    5974            0 :                     rep = STMT_VINFO_STMT (stmt_vinfo);
                                            /* Compare the mask index against -1 exactly as done for
                                               the store representative above; a plain truth test
                                               would treat -1 as masked and index 0 as unmasked.  */
    5975            0 :                     masked = (is_gimple_call (rep)
    5976            0 :                               && gimple_call_internal_p (rep)
    5977            0 :                               && internal_fn_mask_index
    5978            0 :                                    (gimple_call_internal_fn (rep)) != -1);
    5979              :                     /* Use SLP for strided accesses (or if we can't
    5980              :                        load-lanes).  */
    5981            0 :                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
    5982            0 :                         || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
    5983            0 :                         || vect_load_lanes_supported
    5984            0 :                              (SLP_TREE_VECTYPE (load_node),
    5985            0 :                               DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
    5986              :                         /* ???  During SLP re-discovery with a single lane
    5987              :                            a masked grouped load will appear permuted and
    5988              :                            discovery will fail.  We have to rework this
    5989              :                            on the discovery side - for now avoid ICEing.  */
    5990            0 :                         || masked)
    5991              :                       {
    5992              :                         can_use_lanes = false;
    5993              :                         break;
    5994              :                       }
    5995              :                     /* Make sure that the target would prefer store-lanes
    5996              :                        for at least one of the loads.
    5997              : 
    5998              :                        ??? Perhaps we should instead require this for
    5999              :                        all loads?  */
    6000            0 :                     prefer_load_lanes
    6001              :                       = (prefer_load_lanes
    6002            0 :                          || SLP_TREE_LANES (load_node) == group_size
    6003            0 :                          || (vect_slp_prefer_store_lanes_p
    6004            0 :                              (vinfo, stmt_vinfo,
    6005              :                               SLP_TREE_VECTYPE (load_node), masked,
    6006              :                               group_size, SLP_TREE_LANES (load_node))));
    6007              :                   }
    6008              : 
    6009            0 :               if (can_use_lanes && prefer_load_lanes)
    6010              :                 {
    6011            0 :                   if (dump_enabled_p ())
    6012            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    6013              :                                      "SLP instance %p can use load/store-lanes,"
    6014              :                                      " re-discovering with single-lanes\n",
    6015              :                                      (void *) instance);
    6016              : 
    6017            0 :                   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6018              : 
    6019            0 :                   vect_free_slp_instance (instance);
                                          /* Re-discovery starts again with the full size budget.  */
    6020            0 :                   limit = max_tree_size;
    6021            0 :                   bool res = vect_analyze_slp_instance (vinfo, bst_map,
    6022              :                                                         stmt_info,
    6023              :                                                         slp_inst_kind_store,
    6024              :                                                         max_tree_size, &limit,
    6025              :                                                         true);
    6026            0 :                   gcc_assert (res);
                                          /* Presumably the successful re-discovery appended the new
                                             instance; move it into the slot of the freed one.  */
    6027            0 :                   auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
    6028            0 :                   LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
    6029              :                 }
    6030              :             }
    6031            0 :         }
    6032              : 
    6033              :   /* When we end up with load permutations that we cannot possibly handle,
    6034              :      like those requiring three vector inputs, lower them using interleaving
    6035              :      like schemes.  */
    6036      1014177 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    6037              :     {
    6038       405174 :       vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
    6039       405174 :       if (dump_enabled_p ())
    6040              :         {
    6041        19216 :           dump_printf_loc (MSG_NOTE, vect_location,
    6042              :                            "SLP graph after lowering permutations:\n");
    6043        19216 :           hash_set<slp_tree> visited;
    6044        85782 :           FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6045        28159 :             vect_print_slp_graph (MSG_NOTE, vect_location,
    6046              :                                   SLP_INSTANCE_TREE (instance), visited);
    6047        19216 :         }
    6048              :     }
    6049              : 
    6050      1014177 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    6051              : 
    6052      1014177 :   if (pattern_found && dump_enabled_p ())
    6053              :     {
    6054           14 :       dump_printf_loc (MSG_NOTE, vect_location,
    6055              :                        "Pattern matched SLP tree\n");
    6056           14 :       hash_set<slp_tree> visited;
    6057           74 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6058           32 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    6059              :                               SLP_INSTANCE_TREE (instance), visited);
    6060           14 :     }
    6061              : 
    6062      1014177 :   return opt_result::success ();
    6063      1014177 : }
    6064              : 
    6065              : /* Estimates the cost of inserting layout changes into the SLP graph.
    6066              :    It can also say that the insertion is impossible.  */
    6067              : 
    6068              : struct slpg_layout_cost
    6069              : {
    6070      9608829 :   slpg_layout_cost () = default;
    6071              :   slpg_layout_cost (sreal, bool);
    6072              : 
                          /* An impossible insertion is represented by a depth of sreal::max ();
                             is_possible tests for exactly that marker value.  */
    6073       448284 :   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
    6074      4949256 :   bool is_possible () const { return depth != sreal::max (); }
    6075              : 
    6076              :   bool operator== (const slpg_layout_cost &) const;
    6077              :   bool operator!= (const slpg_layout_cost &) const;
    6078              : 
    6079              :   bool is_better_than (const slpg_layout_cost &, bool) const;
    6080              : 
    6081              :   void add_parallel_cost (const slpg_layout_cost &);
    6082              :   void add_serial_cost (const slpg_layout_cost &);
    6083              :   void split (unsigned int);
    6084              : 
    6085              :   /* The longest sequence of layout changes needed during any traversal
    6086              :      of the partition dag, weighted by execution frequency.
    6087              : 
    6088              :      This is the most important metric when optimizing for speed, since
    6089              :      it helps to ensure that we keep the number of operations on
    6090              :      critical paths to a minimum.  */
    6091              :   sreal depth = 0;
    6092              : 
    6093              :   /* An estimate of the total number of operations needed.  It is weighted by
    6094              :      execution frequency when optimizing for speed but not when optimizing for
    6095              :      size.  In order to avoid double-counting, a node with a fanout of N will
    6096              :      distribute 1/N of its total cost to each successor.
    6097              : 
    6098              :      This is the most important metric when optimizing for size, since
    6099              :      it helps to keep the total number of operations to a minimum.  */
    6100              :   sreal total = 0;
    6101              : };
    6102              : 
    6103              : /* Construct costs for a node with weight WEIGHT.  A higher weight
    6104              :    indicates more frequent execution.  IS_FOR_SIZE is true if we are
    6105              :    optimizing for size rather than speed.
                      : 
                      :    DEPTH is always WEIGHT.  TOTAL is also WEIGHT, except that when
                      :    optimizing for size a node with positive weight counts as exactly
                      :    one operation.  */
    6106              : 
    6107      1163084 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
    6108      1163952 :   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
    6109              : {
    6110      1163084 : }
    6111              : /* Return true if these costs have the same DEPTH and TOTAL as OTHER.  */
    6112              : bool
    6113            0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
    6114              : {
    6115            0 :   return depth == other.depth && total == other.total;
    6116              : }
    6117              : /* Return true if these costs differ from OTHER in DEPTH or TOTAL.  */
    6118              : bool
    6119            0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
    6120              : {
    6121            0 :   return !operator== (other);
    6122              : }
    6123              : 
    6124              : /* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
    6125              :    true if we are optimizing for size rather than speed.
                      : 
                      :    When optimizing for size, the lower TOTAL wins and DEPTH only breaks
                      :    ties; when optimizing for speed, the lower DEPTH wins and TOTAL only
                      :    breaks ties.  Costs that are equal in both metrics are not better
                      :    than each other.  */
    6126              : 
    6127              : bool
    6128       291381 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
    6129              :                                   bool is_for_size) const
    6130              : {
    6131       291381 :   if (is_for_size)
    6132              :     {
    6133          382 :       if (total != other.total)
    6134          159 :         return total < other.total;
    6135          223 :       return depth < other.depth;
    6136              :     }
    6137              :   else
    6138              :     {
    6139       290999 :       if (depth != other.depth)
    6140       124583 :         return depth < other.depth;
    6141       166416 :       return total < other.total;
    6142              :     }
    6143              : }
    6144              : 
    6145              : /* Increase the costs to account for something with cost INPUT_COST
    6146              :    happening in parallel with the current costs.  The depth becomes the
                      :    larger of the two depths, whereas the totals are added together.  */
    6147              : 
    6148              : void
    6149       344137 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
    6150              : {
    6151       344137 :   depth = std::max (depth, input_cost.depth);
    6152       344137 :   total += input_cost.total;
    6153       344137 : }
    6154              : 
    6155              : /* Increase the costs to account for something with cost OTHER
    6156              :    happening in series with the current costs.  Both the depths and the
                      :    totals are added together.  */
    6157              : 
    6158              : void
    6159      1401731 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
    6160              : {
    6161      1401731 :   depth += other.depth;
    6162      1401731 :   total += other.total;
    6163      1401731 : }
    6164              : 
    6165              : /* Split the total cost among TIMES successors or predecessors.
                      :    DEPTH is not affected, and a TIMES of 0 or 1 leaves TOTAL as-is.  */
    6166              : 
    6167              : void
    6168      1155497 : slpg_layout_cost::split (unsigned int times)
    6169              : {
    6170      1155497 :   if (times > 1)
    6171       483023 :     total /= times;
    6172      1155497 : }
    6173              : 
    6174              : /* Information about one node in the SLP graph, for use during
    6175              :    vect_optimize_slp_pass.  The constructor only records the node;
                      :    every other field starts with the default value given below.  */
    6176              : 
    6177              : struct slpg_vertex
    6178              : {
    6179      9083528 :   slpg_vertex (slp_tree node_) : node (node_) {}
    6180              : 
    6181              :   /* The node itself.  */
    6182              :   slp_tree node;
    6183              : 
    6184              :   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
    6185              :      partitions are flexible; they can have whichever layout consumers
    6186              :      want them to have.  */
    6187              :   int partition = -1;
    6188              : 
    6189              :   /* The number of nodes that directly use the result of this one
    6190              :      (i.e. the number of nodes that count this one as a child).  */
    6191              :   unsigned int out_degree = 0;
    6192              : 
    6193              :   /* The execution frequency of the node.  */
    6194              :   sreal weight = 0;
    6195              : 
    6196              :   /* The total execution frequency of all nodes that directly use the
    6197              :      result of this one.  */
    6198              :   sreal out_weight = 0;
    6199              : };
    6200              : 
    6201              : /* Information about one partition of the SLP graph, for use during
    6202              :    vect_optimize_slp_pass.  A default-constructed partition covers an
                      :    empty node range, has no layout chosen and has no dag edges.  */
    6203              : 
    6204              : struct slpg_partition_info
    6205              : {
    6206              :   /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
    6207              :      of m_partitioned_nodes.  */
    6208              :   unsigned int node_begin = 0;
    6209              :   unsigned int node_end = 0;
    6210              : 
    6211              :   /* Which layout we've chosen to use for this partition, or -1 if
    6212              :      we haven't picked one yet.  */
    6213              :   int layout = -1;
    6214              : 
    6215              :   /* The number of predecessors and successors in the partition dag.
    6216              :      The predecessors always have lower partition numbers and the
    6217              :      successors always have higher partition numbers.
    6218              : 
    6219              :      Note that the directions of these edges are not necessarily the
    6220              :      same as in the data flow graph.  For example, if an SCC has separate
    6221              :      partitions for an inner loop and an outer loop, the inner loop's
    6222              :      partition will have at least two incoming edges from the outer loop's
    6223              :      partition: one for a live-in value and one for a live-out value.
    6224              :      In data flow terms, one of these edges would also be from the outer loop
    6225              :      to the inner loop, but the other would be in the opposite direction.  */
    6226              :   unsigned int in_degree = 0;
    6227              :   unsigned int out_degree = 0;
    6228              : };
    6229              : 
    6230              : /* Information about the costs of using a particular layout for a
    6231              :    particular partition.  It can also say that the combination is
    6232              :    impossible; that state is recorded in INTERNAL_COST alone, so
                      :    is_possible does not look at IN_COST or OUT_COST.  */
    6233              : 
    6234              : struct slpg_partition_layout_costs
    6235              : {
    6236      1415138 :   bool is_possible () const { return internal_cost.is_possible (); }
    6237        49822 :   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
    6238              : 
    6239              :   /* The costs inherited from predecessor partitions.  */
    6240              :   slpg_layout_cost in_cost;
    6241              : 
    6242              :   /* The inherent cost of the layout within the node itself.  For example,
    6243              :      this is nonzero for a load if choosing a particular layout would require
    6244              :      the load to permute the loaded elements.  It is nonzero for a
    6245              :      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
    6246              :      to full-vector moves.  */
    6247              :   slpg_layout_cost internal_cost;
    6248              : 
    6249              :   /* The costs inherited from successor partitions.  */
    6250              :   slpg_layout_cost out_cost;
    6251              : };
    6252              : 
    6253              : /* This class tries to optimize the layout of vectors in order to avoid
    6254              :    unnecessary shuffling.  At the moment, the set of possible layouts is
    6255              :    restricted to bijective permutations.
    6256              : 
    6257              :    The goal of the pass depends on whether we're optimizing for size or
    6258              :    for speed.  When optimizing for size, the goal is to reduce the overall
    6259              :    number of layout changes (including layout changes implied by things
    6260              :    like load permutations).  When optimizing for speed, the goal is to
    6261              :    reduce the maximum latency attributable to layout changes on any
    6262              :    non-cyclical path through the data flow graph.
    6263              : 
    6264              :    For example, when optimizing a loop nest for speed, we will prefer
    6265              :    to make layout changes outside of a loop rather than inside of a loop,
    6266              :    and will prefer to make layout changes in parallel rather than serially,
    6267              :    even if that increases the overall number of layout changes.
    6268              : 
    6269              :    The high-level procedure is:
    6270              : 
    6271              :    (1) Build a graph in which edges go from uses (parents) to definitions
    6272              :        (children).
    6273              : 
    6274              :    (2) Divide the graph into a dag of strongly-connected components (SCCs).
    6275              : 
    6276              :    (3) When optimizing for speed, partition the nodes in each SCC based
    6277              :        on their containing cfg loop.  When optimizing for size, treat
    6278              :        each SCC as a single partition.
    6279              : 
    6280              :        This gives us a dag of partitions.  The goal is now to assign a
    6281              :        layout to each partition.
    6282              : 
    6283              :    (4) Construct a set of vector layouts that are worth considering.
    6284              :        Record which nodes must keep their current layout.
    6285              : 
    6286              :    (5) Perform a forward walk over the partition dag (from loads to stores)
    6287              :        accumulating the "forward" cost of using each layout.  When visiting
    6288              :        each partition, assign a tentative choice of layout to the partition
    6289              :        and use that choice when calculating the cost of using a different
    6290              :        layout in successor partitions.
    6291              : 
    6292              :    (6) Perform a backward walk over the partition dag (from stores to loads),
    6293              :        accumulating the "backward" cost of using each layout.  When visiting
    6294              :        each partition, make a final choice of layout for that partition based
    6295              :        on the accumulated forward costs (from (5)) and backward costs
    6296              :        (from (6)).
    6297              : 
    6298              :    (7) Apply the chosen layouts to the SLP graph.
    6299              : 
    6300              :    For example, consider the SLP statements:
    6301              : 
    6302              :    S1:      a_1 = load
    6303              :        loop:
    6304              :    S2:      a_2 = PHI<a_1, a_3>
    6305              :    S3:      b_1 = load
    6306              :    S4:      a_3 = a_2 + b_1
    6307              :        exit:
    6308              :    S5:      a_4 = PHI<a_3>
    6309              :    S6:      store a_4
    6310              : 
    6311              :    S2 and S4 form an SCC and are part of the same loop.  Every other
    6312              :    statement is in a singleton SCC.  In this example there is a one-to-one
    6313              :    mapping between SCCs and partitions and the partition dag looks like this:
    6314              : 
    6315              :         S1     S3
    6316              :          \     /
    6317              :           S2+S4
    6318              :             |
    6319              :            S5
    6320              :             |
    6321              :            S6
    6322              : 
    6323              :    S2, S3 and S4 will have a higher execution frequency than the other
    6324              :    statements, so when optimizing for speed, the goal is to avoid any
    6325              :    layout changes:
    6326              : 
    6327              :    - within S3
    6328              :    - within S2+S4
    6329              :    - on the S3->S2+S4 edge
    6330              : 
    6331              :    For example, if S3 was originally a reversing load, the goal of the
    6332              :    pass is to make it an unreversed load and change the layout on the
    6333              :    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
    6334              :    on S1->S2+S4 and S5->S6 would also be acceptable.)
    6335              : 
    6336              :    The difference between SCCs and partitions becomes important if we
    6337              :    add an outer loop:
    6338              : 
    6339              :    S1:      a_1 = ...
    6340              :        loop1:
    6341              :    S2:      a_2 = PHI<a_1, a_6>
    6342              :    S3:      b_1 = load
    6343              :    S4:      a_3 = a_2 + b_1
    6344              :        loop2:
    6345              :    S5:      a_4 = PHI<a_3, a_5>
    6346              :    S6:      c_1 = load
    6347              :    S7:      a_5 = a_4 + c_1
    6348              :        exit2:
    6349              :    S8:      a_6 = PHI<a_5>
    6350              :    S9:      store a_6
    6351              :        exit1:
    6352              : 
    6353              :    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
    6354              :    for speed, we usually do not want restrictions in the outer loop to "infect"
    6355              :    the decision for the inner loop.  For example, if an outer-loop node
    6356              :    in the SCC contains a statement with a fixed layout, that should not
    6357              :    prevent the inner loop from using a different layout.  Conversely,
    6358              :    the inner loop should not dictate a layout to the outer loop: if the
    6359              :    outer loop does a lot of computation, then it may not be efficient to
    6360              :    do all of that computation in the inner loop's preferred layout.
    6361              : 
    6362              :    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
    6363              :    and S5+S7 (inner).  We also try to arrange partitions so that:
    6364              : 
    6365              :    - the partition for an outer loop comes before the partition for
    6366              :      an inner loop
    6367              : 
    6368              :    - if a sibling loop A dominates a sibling loop B, A's partition
    6369              :      comes before B's
    6370              : 
    6371              :    This gives the following partition dag for the example above:
    6372              : 
    6373              :         S1        S3
    6374              :          \        /
    6375              :           S2+S4+S8   S6
    6376              :            |   \\    /
    6377              :            |    S5+S7
    6378              :            |
    6379              :           S9
    6380              : 
    6381              :    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
    6382              :    one for a reversal of the edge S7->S8.
    6383              : 
    6384              :    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
    6385              :    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
    6386              :    preferred layout against the cost of changing the layout on entry to the
    6387              :    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
    6388              : 
    6389              :    Although this works well when optimizing for speed, it has the downside
    6390              :    when optimizing for size that the choice of layout for S5+S7 is completely
    6391              :    independent of S9, which lessens the chance of reducing the overall number
    6392              :    of permutations.  We therefore do not partition SCCs when optimizing
    6393              :    for size.
    6394              : 
    6395              :    To give a concrete example of the difference between optimizing
    6396              :    for size and speed, consider:
    6397              : 
    6398              :    a[0] = (b[1] << c[3]) - d[1];
    6399              :    a[1] = (b[0] << c[2]) - d[0];
    6400              :    a[2] = (b[3] << c[1]) - d[3];
    6401              :    a[3] = (b[2] << c[0]) - d[2];
    6402              : 
    6403              :    There are three different layouts here: one for a, one for b and d,
    6404              :    and one for c.  When optimizing for speed it is better to permute each
    6405              :    of b, c and d into the order required by a, since those permutations
    6406              :    happen in parallel.  But when optimizing for size, it is better to:
    6407              : 
    6408              :    - permute c into the same order as b
    6409              :    - do the arithmetic
    6410              :    - permute the result into the order required by a
    6411              : 
    6412              :    This gives 2 permutations rather than 3.  */
    6413              : 
    6414              : class vect_optimize_slp_pass
    6415              : {
    6416              : public:
    6417       624215 :   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
    6418              :   void run ();
    6419              : 
    6420              : private:
    6421              :   /* Graph building.  */
    6422              :   struct loop *containing_loop (slp_tree);
    6423              :   bool is_cfg_latch_edge (graph_edge *);
    6424              :   void build_vertices (hash_set<slp_tree> &, slp_tree);
    6425              :   void build_vertices ();
    6426              :   void build_graph ();
    6427              : 
    6428              :   /* Partitioning.  */
    6429              :   void create_partitions ();
    6430              :   template<typename T> void for_each_partition_edge (unsigned int, T);
    6431              : 
    6432              :   /* Layout selection.  */
    6433              :   bool is_compatible_layout (slp_tree, unsigned int);
    6434              :   bool is_compatible_layout (const slpg_partition_info &, unsigned int);
    6435              :   int change_layout_cost (slp_tree, unsigned int, unsigned int);
    6436              :   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
    6437              :                                                        unsigned int);
    6438              :   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
    6439              :                                int, unsigned int);
    6440              :   int internal_node_cost (slp_tree, int, unsigned int);
    6441              :   void start_choosing_layouts ();
    6442              :   bool legitimize ();
    6443              : 
    6444              :   /* Cost propagation.  */
    6445              :   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
    6446              :                                      unsigned int, unsigned int);
    6447              :   slpg_layout_cost total_in_cost (unsigned int);
    6448              :   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
    6449              :   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
    6450              :   void forward_pass ();
    6451              :   void backward_pass ();
    6452              : 
    6453              :   /* Rematerialization.  */
    6454              :   slp_tree get_result_with_layout (slp_tree, unsigned int);
    6455              :   void materialize ();
    6456              : 
    6457              :   /* Clean-up.  */
    6458              :   void remove_redundant_permutations ();
    6459              : 
    6460              :   /* Masked load lanes discovery.  */
    6461              :   void decide_masked_load_lanes ();
    6462              : 
    6463              :   void dump ();
    6464              : 
    6465              :   vec_info *m_vinfo;
    6466              : 
    6467              :   /* True if we should optimize the graph for size, false if we should
    6468              :      optimize it for speed.  (It wouldn't be easy to make this decision
    6469              :      more locally.)  There is no default value here: build_graph sets the
                      :      flag to true and build_vertices clears it again if it finds a node
                      :      in a block that should be optimized for speed.  */
    6470              :   bool m_optimize_size;
    6471              : 
    6472              :   /* A graph of all SLP nodes, with edges leading from uses to definitions.
    6473              :      In other words, a node's predecessors are its slp_tree parents and
    6474              :      a node's successors are its slp_tree children.  */
    6475              :   graph *m_slpg = nullptr;
    6476              : 
    6477              :   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
    6478              :   auto_vec<slpg_vertex> m_vertices;
    6479              : 
    6480              :   /* The list of all leaves of M_SLPG, such as external definitions, constants,
    6481              :      and loads.  */
    6482              :   auto_vec<int> m_leafs;
    6483              : 
    6484              :   /* This array has one entry for every vector layout that we're considering.
    6485              :      Element 0 is null and indicates "no change".  Other entries describe
    6486              :      permutations that are inherent in the current graph and that we would
    6487              :      like to reverse if possible.
    6488              : 
    6489              :      For example, a permutation { 1, 2, 3, 0 } means that something has
    6490              :      effectively been permuted in that way, such as a load group
    6491              :      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
    6492              :      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
    6493              :      in order to put things "back" in order.  */
    6494              :   auto_vec<vec<unsigned> > m_perms;
    6495              : 
    6496              :   /* A partitioning of the nodes for which a layout must be chosen.
    6497              :      Each partition represents an <SCC, cfg loop> pair; that is,
    6498              :      nodes in different SCCs belong to different partitions, and nodes
    6499              :      within an SCC can be further partitioned according to a containing
    6500              :      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
    6501              : 
    6502              :      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
    6503              :        from leaves (such as loads) to roots (such as stores).
    6504              : 
    6505              :      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
    6506              :   auto_vec<slpg_partition_info> m_partitions;
    6507              : 
    6508              :   /* The list of all nodes for which a layout must be chosen.  Nodes for
    6509              :      partition P come before the nodes for partition P+1.  Nodes within a
    6510              :      partition are in reverse postorder.  */
    6511              :   auto_vec<unsigned int> m_partitioned_nodes;
    6512              : 
    6513              :   /* Index P * num-layouts + L contains the cost of using layout L
    6514              :      for partition P.  */
    6515              :   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
    6516              : 
    6517              :   /* Index N * num-layouts + L, if nonnull, is a node that provides the
    6518              :      original output of node N adjusted to have layout L.  */
    6519              :   auto_vec<slp_tree> m_node_layouts;
    6520              : };
    6521              : 
    6522              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
    6523              :    Also record whether we should optimize anything for speed rather
    6524              :    than size.
                      : 
                      :    This is the recursive worker: it handles NODE and everything
                      :    reachable from it through SLP_TREE_CHILDREN, doing nothing for nodes
                      :    that are already in VISITED.  Each new node is given the next free
                      :    index in m_vertices as its vertex number.  m_optimize_size is
                      :    cleared as soon as one node's representative statement lives in a
                      :    block for which optimize_bb_for_speed_p holds.  */
    6525              : 
    6526              : void
    6527      9727212 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
    6528              :                                         slp_tree node)
    6529              : {
    6530      9727212 :   unsigned i;
    6531      9727212 :   slp_tree child;
    6532              : 
    6533      9727212 :   if (visited.add (node))
    6534      9727212 :     return;
    6535              : 
    6536      9083528 :   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    6537              :     {
    6538      7044187 :       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
    6539      6262027 :       if (optimize_bb_for_speed_p (bb))
    6540      6144913 :         m_optimize_size = false;
    6541              :     }
    6542              : 
    6543      9083528 :   node->vertex = m_vertices.length ();
    6544      9083528 :   m_vertices.safe_push (slpg_vertex (node));
    6545              : 
    6546      9083528 :   bool leaf = true;
    6547      9083528 :   bool force_leaf = false;
    6548     16808002 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    6549      7724474 :     if (child)
    6550              :       {
    6551      6970640 :         leaf = false;
    6552      6970640 :         build_vertices (visited, child);
    6553              :       }
    6554              :     else
    6555              :       force_leaf = true;
    6556              :   /* Since SLP discovery works along use-def edges all cycles have an
    6557              :      entry - but there's the exception of cycles where we do not handle
    6558              :      the entry explicitly (but with a NULL SLP node), like some reductions
    6559              :      and inductions.  Force those SLP PHIs to act as leafs to make them
    6560              :      backwards reachable.  A node is therefore recorded as a leaf if it
                      :      has no nonnull children or if it has at least one null child.  */
    6561      9083528 :   if (leaf || force_leaf)
    6562      4525802 :     m_leafs.safe_push (node->vertex);
    6563              : }
    6564              : 
    6565              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
                      :    Any previous contents of the two vectors are discarded first.  The
                      :    walk starts from the root of every SLP instance in m_vinfo and
                      :    shares one visited set, so that each node is entered only once.  */
    6566              : 
    6567              : void
    6568      1248430 : vect_optimize_slp_pass::build_vertices ()
    6569              : {
    6570      1248430 :   hash_set<slp_tree> visited;
    6571      1248430 :   unsigned i;
    6572      1248430 :   slp_instance instance;
    6573      1248430 :   m_vertices.truncate (0);
    6574      1248430 :   m_leafs.truncate (0);
    6575      6501862 :   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
    6576      2756572 :     build_vertices (visited, SLP_INSTANCE_TREE (instance));
    6577      1248430 : }
    6578              : 
    6579              : /* Apply (reverse) bijective PERM to VEC.
                      : 
                      :    VEC is updated in place from a saved copy of its original contents.
                      :    If REVERSE is true, original element I is stored at index PERM[I].
                      :    Otherwise element I of the result is original element PERM[I].
                      :    In both cases a second loop asserts that every element ended up
                      :    where PERM says it should be.  */
    6580              : 
    6581              : template <class T>
    6582              : static void
    6583       190896 : vect_slp_permute (vec<unsigned> perm,
    6584              :                   vec<T> &vec, bool reverse)
    6585              : {
    6586       190896 :   auto_vec<T, 64> saved;
    6587       190896 :   saved.create (vec.length ());
    6588       623222 :   for (unsigned i = 0; i < vec.length (); ++i)
    6589       432326 :     saved.quick_push (vec[i]);
    6590              : 
    6591       190896 :   if (reverse)
    6592              :     {
    6593      1236597 :       for (unsigned i = 0; i < vec.length (); ++i)
    6594       431114 :         vec[perm[i]] = saved[i];
    6595       621472 :       for (unsigned i = 0; i < vec.length (); ++i)
    6596       759695 :         gcc_assert (vec[perm[i]] == saved[i]);
    6597              :     }
    6598              :   else
    6599              :     {
    6600         3500 :       for (unsigned i = 0; i < vec.length (); ++i)
    6601         1212 :         vec[i] = saved[perm[i]];
    6602       192108 :       for (unsigned i = 0; i < vec.length (); ++i)
    6603         1818 :         gcc_assert (vec[i] == saved[perm[i]]);
    6604              :     }
    6605       190896 : }
    6606              : 
    6607              : /* Return the cfg loop that contains NODE.  The loop is taken from the
                      :    block of the original statement behind NODE's representative.
                      :    A node without a representative is given the loop_father of the
                      :    function's entry block instead.  */
    6608              : 
    6609              : struct loop *
    6610      3425412 : vect_optimize_slp_pass::containing_loop (slp_tree node)
    6611              : {
    6612      3425412 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    6613      3425412 :   if (!rep)
    6614         4602 :     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
    6615      3821043 :   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
    6616              : }
    6617              : 
    6618              : /* Return true if UD (an edge from a use to a definition) is associated
    6619              :    with a loop latch edge in the cfg.
                      : 
                      :    The answer is false unless both ends are vect_internal_def nodes and
                      :    the use is not a permute node.  Beyond that, the original statement
                      :    of the use's representative must be a PHI in a loop header block,
                      :    and the definition must be in the same cfg loop as the use.  */
    6620              : 
    6621              : bool
    6622      6970640 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
    6623              : {
    6624      6970640 :   slp_tree use = m_vertices[ud->src].node;
    6625      6970640 :   slp_tree def = m_vertices[ud->dest].node;
    6626      6970640 :   if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
    6627      6970640 :        || SLP_TREE_PERMUTE_P (use))
    6628      6679160 :       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
    6629              :     return false;
    6630              : 
    6631      3868608 :   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
    6632      3868608 :   return (is_a<gphi *> (use_rep->stmt)
    6633       319358 :           && bb_loop_header_p (gimple_bb (use_rep->stmt))
    6634      4025734 :           && containing_loop (def) == containing_loop (use));
    6635              : }
    6636              : 
    6637              : /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
    6638              :    a nonnull data field.  The graph has one vertex per entry of m_vertices and one use-to-def edge per nonnull child of each node.  */
    6639              : 
    6640              : void
    6641      1248430 : vect_optimize_slp_pass::build_graph ()
    6642              : {
    6643      1248430 :   m_optimize_size = true;  /* NOTE(review): presumably refined by build_vertices -- confirm.  */
    6644      1248430 :   build_vertices ();
    6645              : 
    6646      2496860 :   m_slpg = new_graph (m_vertices.length ());
    6647     12828818 :   for (slpg_vertex &v : m_vertices)
    6648     26805086 :     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
    6649      7724474 :       if (child)  /* Null child slots get no edge.  */
    6650              :         {
    6651      6970640 :           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
    6652      6970640 :           if (is_cfg_latch_edge (ud))
    6653       148672 :             ud->data = this;  /* skip_cfg_latch_edges only tests this field for being nonnull.  */
    6654              :         }
    6655      1248430 : }
    6656              : 
    6657              : /* Return true if E corresponds to a loop latch edge in the cfg, i.e. if build_graph gave E a nonnull data field.  */
    6658              : 
    6659              : static bool
    6660      3559387 : skip_cfg_latch_edges (graph_edge *e)
    6661              : {
    6662      3559387 :   return e->data;  /* Pointer-to-bool conversion: true iff the field is nonnull.  */
    6663              : }
    6664              : 
    6665              : /* Create the node partitions: fill in m_partitions, the partition field of each vertex, and m_partitioned_nodes (in partition order).  */
    6666              : 
    6667              : void
    6668       624215 : vect_optimize_slp_pass::create_partitions ()
    6669              : {
    6670              :   /* Calculate a postorder of the graph, ignoring edges that correspond
    6671              :      to natural latch edges in the cfg.  Reading the vector from the end
    6672              :      to the beginning gives the reverse postorder.  */
    6673       624215 :   auto_vec<int> initial_rpo;
    6674      1248430 :   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
    6675              :                false, NULL, skip_cfg_latch_edges);
    6676      1872645 :   gcc_assert (initial_rpo.length () == m_vertices.length ());
    6677              : 
    6678              :   /* Calculate the strongly connected components of the graph.  */
    6679       624215 :   auto_vec<int> scc_grouping;
    6680       624215 :   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
    6681              : 
    6682              :   /* Create a new index order in which all nodes from the same SCC are
    6683              :      consecutive.  Use scc_pos to record the index of the first node in
    6684              :      each SCC.  */
    6685       624215 :   auto_vec<unsigned int> scc_pos (num_sccs);
    6686       624215 :   int last_component = -1;
    6687       624215 :   unsigned int node_count = 0;
    6688      6414142 :   for (unsigned int node_i : scc_grouping)
    6689              :     {
    6690      4541497 :       if (last_component != m_slpg->vertices[node_i].component)
    6691              :         {
    6692      4449831 :           last_component = m_slpg->vertices[node_i].component;
    6693      8899662 :           gcc_assert (last_component == int (scc_pos.length ()));  /* Components must appear in increasing order, each as one run.  */
    6694      4449831 :           scc_pos.quick_push (node_count);
    6695              :         }
    6696      4541497 :       node_count += 1;
    6697              :     }
    6698      1248430 :   gcc_assert (node_count == initial_rpo.length ()
    6699              :               && last_component + 1 == int (num_sccs));
    6700              : 
    6701              :   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
    6702              :      inside each SCC following the RPO we calculated above.  The fact that
    6703              :      we ignored natural latch edges when calculating the RPO should ensure
    6704              :      that, for natural loop nests:
    6705              : 
    6706              :      - the first node that we encounter in a cfg loop is the loop header phi
    6707              :      - the loop header phis are in dominance order
    6708              : 
    6709              :      Arranging for this is an optimization (see below) rather than a
    6710              :      correctness issue.  Unnatural loops with a tangled mess of backedges
    6711              :      will still work correctly, but might give poorer results.
    6712              : 
    6713              :      Also update scc_pos so that it gives 1 + the index of the last node
    6714              :      in the SCC.  */
    6715       624215 :   m_partitioned_nodes.safe_grow (node_count);
    6716      5789927 :   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    6717              :     {
    6718      4541497 :       unsigned int node_i = initial_rpo[old_i];
    6719      4541497 :       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
    6720      4541497 :       m_partitioned_nodes[new_i] = node_i;
    6721              :     }
    6722              : 
    6723              :   /* When optimizing for speed, partition each SCC based on the containing
    6724              :      cfg loop.  The order we constructed above should ensure that, for natural
    6725              :      cfg loops, we'll create sub-SCC partitions for outer loops before
    6726              :      the corresponding sub-SCC partitions for inner loops.  Similarly,
    6727              :      when one sibling loop A dominates another sibling loop B, we should
    6728              :      create a sub-SCC partition for A before a sub-SCC partition for B.
    6729              : 
    6730              :      As above, nothing depends for correctness on whether this achieves
    6731              :      a natural nesting, but we should get better results when it does.  When optimizing for size, each SCC instead gets at most one partition.  */
    6732      1248430 :   m_partitions.reserve (m_vertices.length ());
    6733       624215 :   unsigned int next_partition_i = 0;
    6734       624215 :   hash_map<struct loop *, int> loop_partitions;
    6735       624215 :   unsigned int rpo_begin = 0;
    6736       624215 :   unsigned int num_partitioned_nodes = 0;
    6737      6322476 :   for (unsigned int rpo_end : scc_pos)
    6738              :     {
    6739      4449831 :       loop_partitions.empty ();  /* Loop-to-partition mappings do not carry over between SCCs.  */
    6740              :       unsigned int partition_i = next_partition_i;
    6741      8991328 :       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
    6742              :         {
    6743              :           /* Handle externals and constants optimistically throughout.
    6744              :              But treat existing vectors as fixed since we do not handle
    6745              :              permuting them.  */
    6746      4541497 :           unsigned int node_i = m_partitioned_nodes[rpo_i];
    6747      4541497 :           auto &vertex = m_vertices[node_i];
    6748      4541497 :           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
    6749       494727 :                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
    6750      4543735 :               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
    6751      1406148 :             vertex.partition = -1;  /* Not assigned to any partition.  */
    6752              :           else
    6753              :             {
    6754      3135349 :               bool existed;
    6755      3135349 :               if (m_optimize_size)
    6756        24189 :                 existed = next_partition_i > partition_i;  /* True once this SCC has created its single partition.  */
    6757              :               else
    6758              :                 {
    6759      3111160 :                   struct loop *loop = containing_loop (vertex.node);
    6760      3111160 :                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
    6761      3111160 :                   if (!existed)
    6762      3020478 :                     entry = next_partition_i;
    6763      3111160 :                   partition_i = entry;
    6764              :                 }
    6765      3135349 :               if (!existed)
    6766              :                 {
    6767      3044589 :                   m_partitions.quick_push (slpg_partition_info ());
    6768      3044589 :                   next_partition_i += 1;
    6769              :                 }
    6770      3135349 :               vertex.partition = partition_i;
    6771      3135349 :               num_partitioned_nodes += 1;
    6772      3135349 :               m_partitions[partition_i].node_end += 1;  /* For now node_end just counts the partition's nodes.  */
    6773              :             }
    6774              :         }
    6775      4449831 :       rpo_begin = rpo_end;
    6776              :     }
    6777              : 
    6778              :   /* Assign ranges of consecutive node indices to each partition,
    6779              :      in partition order.  Start with node_end being the same as
    6780              :      node_begin so that the next loop can use it as a counter.  */
    6781       624215 :   unsigned int node_begin = 0;
    6782      4917234 :   for (auto &partition : m_partitions)
    6783              :     {
    6784      3044589 :       partition.node_begin = node_begin;
    6785      3044589 :       node_begin += partition.node_end;
    6786      3044589 :       partition.node_end = partition.node_begin;
    6787              :     }
    6788       624215 :   gcc_assert (node_begin == num_partitioned_nodes);
    6789              : 
    6790              :   /* Finally build the list of nodes in partition order.  */
    6791       624215 :   m_partitioned_nodes.truncate (num_partitioned_nodes);  /* Only nodes with a nonnegative partition are listed from here on.  */
    6792      5165712 :   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    6793              :     {
    6794      4541497 :       int partition_i = m_vertices[node_i].partition;
    6795      4541497 :       if (partition_i >= 0)
    6796              :         {
    6797      3135349 :           unsigned int order_i = m_partitions[partition_i].node_end++;
    6798      3135349 :           m_partitioned_nodes[order_i] = node_i;
    6799              :         }
    6800              :     }
    6801       624215 : }
    6802              : 
    6803              : /* Look for edges from earlier partitions into node NODE_I and edges from
    6804              :    node NODE_I into later partitions.  Call:
    6805              : 
    6806              :       FN (ud, other_node_i)
    6807              : 
    6808              :    for each such use-to-def edge ud, where other_node_i is the node at the
    6809              :    other end of the edge.  Edges whose other end has a negative partition, or lies in NODE_I's own partition, are skipped.  */
    6810              : 
    6811              : template<typename T>
    6812              : void
    6813      3524504 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
    6814              : {
    6815      3524504 :   int partition_i = m_vertices[node_i].partition;
    6816      3524504 :   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
    6817      5994510 :        pred; pred = pred->pred_next)
    6818              :     {
    6819      2470006 :       int src_partition_i = m_vertices[pred->src].partition;
    6820      2470006 :       if (src_partition_i >= 0 && src_partition_i != partition_i)
    6821      2244276 :         fn (pred, pred->src);  /* Incoming edge: the other end is the edge's source.  */
    6822              :     }
    6823      3524504 :   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
    6824      7547774 :        succ; succ = succ->succ_next)
    6825              :     {
    6826      4023270 :       int dest_partition_i = m_vertices[succ->dest].partition;
    6827      4023270 :       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
    6828      2266053 :         fn (succ, succ->dest);  /* Outgoing edge: the other end is the edge's destination.  */
    6829              :     }
    6830      3524504 : }
    6831              : 
    6832              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6833              :    that NODE would operate on.  This test is independent of NODE's actual
    6834              :    operation.  Layout 0 is always compatible; any other layout must have exactly as many entries as NODE has lanes.  */
    6835              : 
    6836              : bool
    6837      1574170 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
    6838              :                                               unsigned int layout_i)
    6839              : {
    6840      1574170 :   if (layout_i == 0)
    6841              :     return true;
    6842              : 
    6843       912258 :   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
    6844        11596 :     return false;  /* Lane count differs from the length of the layout's permutation.  */
    6845              : 
    6846              :   return true;
    6847              : }
    6848              : 
    6849              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6850              :    that NODE would operate on for each NODE in PARTITION.
    6851              :    This test is independent of NODE's actual operations.  A partition with an empty node range is trivially compatible.  */
    6852              : 
    6853              : bool
    6854        17263 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
    6855              :                                                 &partition,
    6856              :                                               unsigned int layout_i)
    6857              : {
    6858        34760 :   for (unsigned int order_i = partition.node_begin;
    6859        34760 :        order_i < partition.node_end; ++order_i)
    6860              :     {
    6861        17563 :       unsigned int node_i = m_partitioned_nodes[order_i];  /* Map the position in partition order to a vertex index.  */
    6862        17563 :       auto &vertex = m_vertices[node_i];
    6863              : 
    6864              :       /* The layout is incompatible if it is individually incompatible
    6865              :          with any node in the partition.  */
    6866        17563 :       if (!is_compatible_layout (vertex.node, layout_i))
    6867              :         return false;
    6868              :     }
    6869              :   return true;
    6870              : }
    6871              : 
    6872              : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
    6873              :    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
    6874              :    layouts is incompatible with NODE or if the change is not possible for
    6875              :    some other reason.  Return 0 if the two layouts are the same, and at least 1 for any other supported change.
    6876              : 
    6877              :    The properties taken from NODE include the number of lanes and the
    6878              :    vector type.  The actual operation doesn't matter.  */
    6879              : 
    6880              : int
    6881       674851 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
    6882              :                                             unsigned int from_layout_i,
    6883              :                                             unsigned int to_layout_i)
    6884              : {
    6885       674851 :   if (!is_compatible_layout (node, from_layout_i)
    6886       674851 :       || !is_compatible_layout (node, to_layout_i))
    6887          569 :     return -1;
    6888              : 
    6889       674282 :   if (from_layout_i == to_layout_i)
    6890              :     return 0;
    6891              : 
    6892       292254 :   auto_vec<slp_tree, 1> children (1);
    6893       292254 :   children.quick_push (node);  /* Model the change as a permutation whose only input is NODE.  */
    6894       292254 :   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
    6895       292254 :   if (from_layout_i > 0)
    6896       826424 :     for (unsigned int i : m_perms[from_layout_i])
    6897       363947 :       perm.quick_push ({ 0, i });
    6898              :   else
    6899       447104 :     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
    6900       309009 :       perm.quick_push ({ 0, i });  /* Layout 0: identity selection from input 0.  */
    6901       292254 :   if (to_layout_i > 0)
    6902       138522 :     vect_slp_permute (m_perms[to_layout_i], perm, true);
    6903       292254 :   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
    6904              :                                                children, false);
    6905       292254 :   if (count >= 0)
    6906       287786 :     return MAX (count, 1);  /* Never report a change between different layouts as free.  */
    6907              : 
    6908              :   /* ??? In principle we could try changing via layout 0, giving two
    6909              :      layout changes rather than 1.  Doing that would require
    6910              :      corresponding support in get_result_with_layout.  */
    6911              :   return -1;
    6912       292254 : }
    6913              : 
    6914              : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  The result is a reference into m_partition_layout_costs.  */
    6915              : 
    6916              : inline slpg_partition_layout_costs &
    6917       972710 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
    6918              :                                                 unsigned int layout_i)
    6919              : {
    6920      1945420 :   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];  /* Flat array indexed as [partition][layout], with m_perms.length () layouts per partition.  */
    6921              : }
    6922              : 
    6923              : /* Change PERM in one of two ways:
    6924              : 
    6925              :    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
    6926              :      chosen for child I of NODE.
    6927              : 
    6928              :    - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
    6929              : 
    6930              :    In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
    6931              : 
    6932              : void
    6933        27867 : vect_optimize_slp_pass::
    6934              : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
    6935              :                         int in_layout_i, unsigned int out_layout_i)
    6936              : {
    6937       163837 :   for (auto &entry : perm)
    6938              :     {
    6939        80236 :       int this_in_layout_i = in_layout_i;
    6940        80236 :       if (this_in_layout_i < 0)
    6941              :         {
    6942        57281 :           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];  /* entry.first selects the input operand.  */
    6943        57281 :           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
    6944        57281 :           if (in_partition_i == -1u)
    6945          329 :             continue;  /* The child has no partition (-1), so leave this entry unchanged.  */
    6946        56952 :           this_in_layout_i = m_partitions[in_partition_i].layout;
    6947              :         }
    6948        79907 :       if (this_in_layout_i > 0)
    6949        17441 :         entry.second = m_perms[this_in_layout_i][entry.second];  /* Remap the selected lane through the input's layout.  */
    6950              :     }
    6951        27867 :   if (out_layout_i > 0)
    6952         6305 :     vect_slp_permute (m_perms[out_layout_i], perm, true);
    6953        27867 : }
    6954              : 
    6955              : /* Check whether the target allows NODE to be rearranged so that the node's
    6956              :    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
    6957              :    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
    6958              : 
    6959              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
    6960              :    NODE can adapt to the layout changes that have (perhaps provisionally)
    6961              :    been chosen for NODE's children, so that no extra permutations are
    6962              :    needed on either the input or the output of NODE.
    6963              : 
    6964              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
    6965              :    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
    6966              : 
    6967              :    IN_LAYOUT_I has no meaning for other types of node.
    6968              : 
    6969              :    Keeping the node as-is is always valid.  If the target doesn't appear
    6970              :    to support the node as-is, but might realistically support other layouts,
    6971              :    then layout 0 instead has the cost of a worst-case permutation.  On the
    6972              :    one hand, this ensures that every node has at least one valid layout,
    6973              :    avoiding what would otherwise be an awkward special case.  On the other,
    6974              :    it still encourages the pass to change an invalid pre-existing layout
    6975              :    choice into a valid one.  */
    6976              : 
    6977              : int
    6978       206870 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
    6979              :                                             unsigned int out_layout_i)
    6980              : {
    6981       206870 :   const int fallback_cost = 1;  /* Returned for layout 0 when the node is unsupported as-is but other layouts might be supported.  */
    6982              : 
    6983       206870 :   if (SLP_TREE_PERMUTE_P (node))
    6984              :     {
    6985        23544 :       auto_lane_permutation_t tmp_perm;
    6986        23544 :       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));  /* Work on a copy; NODE's own permutation is not modified here.  */
    6987              : 
    6988              :       /* Check that the child nodes support the chosen layout.  Checking
    6989              :          the first child is enough, since any second child would have the
    6990              :          same shape.  */
    6991        23544 :       auto first_child = SLP_TREE_CHILDREN (node)[0];
    6992        23544 :       if (in_layout_i > 0
    6993        23544 :           && !is_compatible_layout (first_child, in_layout_i))
    6994              :         return -1;
    6995              : 
    6996        22979 :       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
    6997        45958 :       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
    6998              :                                                   node, tmp_perm,
    6999        22979 :                                                   SLP_TREE_CHILDREN (node),
    7000              :                                                   false);
    7001        22979 :       if (count < 0)
    7002              :         {
    7003         1516 :           if (in_layout_i == 0 && out_layout_i == 0)
    7004              :             {
    7005              :               /* Use the fallback cost if the node could in principle support
    7006              :                  some nonzero layout for both the inputs and the outputs.
    7007              :                  Otherwise assume that the node will be rejected later
    7008              :                  and rebuilt from scalars.  */
    7009          369 :               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
    7010              :                 return fallback_cost;
    7011          299 :               return 0;
    7012              :             }
    7013              :           return -1;
    7014              :         }
    7015              : 
    7016              :       /* We currently have no way of telling whether the new layout is cheaper
    7017              :          or more expensive than the old one.  But at least in principle,
    7018              :          it should be worth making zero permutations (whole-vector shuffles)
    7019              :          cheaper than real permutations, in case the pass is able to remove
    7020              :          the latter.  */
    7021        21463 :       return count == 0 ? 0 : 1;
    7022        23544 :     }
    7023              : 
    7024       183326 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    7025       183326 :   if (rep
    7026       182387 :       && STMT_VINFO_DATA_REF (rep)
    7027        57723 :       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
    7028       224351 :       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7029              :     {
    7030        35160 :       auto_load_permutation_t tmp_perm;
    7031        35160 :       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));  /* As above, work on a copy of the permutation.  */
    7032        35160 :       if (out_layout_i > 0)
    7033        12277 :         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
    7034              : 
    7035        35160 :       poly_uint64 vf = 1;
    7036        35160 :       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
    7037         7972 :         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);  /* VF stays 1 when m_vinfo is not a loop_vec_info.  */
    7038        35160 :       unsigned int n_perms;
    7039        35160 :       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
    7040              :                                            nullptr, vf, true, false, &n_perms))
    7041              :         {
    7042         1481 :           auto rep = SLP_TREE_REPRESENTATIVE (node);  /* Shadows the outer REP, which was initialized from the same expression.  */
    7043         1481 :           if (out_layout_i == 0)
    7044              :             {
    7045              :               /* Use the fallback cost if the load is an N-to-N permutation.
    7046              :                  Otherwise assume that the node will be rejected later
    7047              :                  and rebuilt from scalars.  */
    7048         1078 :               if (STMT_VINFO_GROUPED_ACCESS (rep)
    7049         2156 :                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
    7050         1078 :                       == SLP_TREE_LANES (node)))
    7051          582 :                 return fallback_cost;
    7052              :               return 0;
    7053              :             }
    7054              :           return -1;
    7055              :         }
    7056              : 
    7057              :       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
    7058        33679 :       return n_perms == 0 ? 0 : 1;
    7059        35160 :     }
    7060              : 
    7061              :   return 0;  /* Any other kind of node is costed as free for every layout.  */
    7062              : }
    7063              : 
    7064              : /* Decide which element layouts we should consider using.  Calculate the
    7065              :    weights associated with inserting layout changes on partition edges.
    7066              :    Also mark partitions that cannot change layout, by setting their
    7067              :    layout to zero.  */
    7068              : 
    7069              : void
    7070       624215 : vect_optimize_slp_pass::start_choosing_layouts ()
    7071              : {
    7072              :   /* Used to assign unique permutation indices.  */
    7073       624215 :   using perm_hash = unbounded_hashmap_traits<
    7074              :     vec_free_hash_base<int_hash_base<unsigned>>,
    7075              :     int_hash<int, -1, -2>
    7076              :   >;
    7077       624215 :   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
    7078              : 
    7079              :   /* Layout 0 is "no change".  */
    7080       624215 :   m_perms.safe_push (vNULL);
    7081              : 
    7082              :   /* Create layouts from existing permutations.  */
    7083       624215 :   auto_load_permutation_t tmp_perm;
    7084      5007994 :   for (unsigned int node_i : m_partitioned_nodes)
    7085              :     {
    7086              :       /* Leafs also double as entries to the reverse graph.  Allow the
    7087              :          layout of those to be changed.  */
    7088      3135349 :       auto &vertex = m_vertices[node_i];
    7089      3135349 :       auto &partition = m_partitions[vertex.partition];
    7090      3135349 :       if (!m_slpg->vertices[node_i].succ)
    7091       792854 :         partition.layout = 0;
    7092              : 
    7093              :       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
    7094      3135349 :       slp_tree node = vertex.node;
    7095      3135349 :       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
    7096      3135349 :       slp_tree child;
    7097      3135349 :       unsigned HOST_WIDE_INT imin, imax = 0;
    7098      3135349 :       bool any_permute = false;
    7099      3135349 :       tmp_perm.truncate (0);
    7100      3135349 :       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7101              :         {
    7102              :           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
    7103              :              unpermuted, record a layout that reverses this permutation.
    7104              : 
    7105              :              We would need more work to cope with loads that are internally
    7106              :              permuted and also have inputs (such as masks for
    7107              :              IFN_MASK_LOADs).  */
    7108       521464 :           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
    7109       521464 :           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
    7110              :             {
    7111       357824 :               partition.layout = -1;
    7112      3119317 :               continue;
    7113              :             }
    7114       163640 :           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
    7115       163640 :           imin = DR_GROUP_SIZE (dr_stmt) + 1;
    7116       163640 :           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7117              :         }
    7118      5112701 :       else if (SLP_TREE_PERMUTE_P (node)
    7119       130270 :                && SLP_TREE_CHILDREN (node).length () == 1
    7120       115069 :                && (child = SLP_TREE_CHILDREN (node)[0])
    7121      2728954 :                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
    7122       115069 :                    .is_constant (&imin)))
    7123              :         {
    7124              :           /* If the child has the same vector size as this node,
    7125              :              reversing the permutation can make the permutation a no-op.
    7126              :              In other cases it can change a true permutation into a
    7127              :              full-vector extract.  */
    7128       115069 :           tmp_perm.reserve (SLP_TREE_LANES (node));
    7129       307183 :           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7130       192114 :             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
    7131              :         }
    7132              :       else
    7133      2498816 :         continue;
    7134              : 
    7135       734731 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7136              :         {
    7137       456022 :           unsigned idx = tmp_perm[j];
    7138       456022 :           imin = MIN (imin, idx);
    7139       456022 :           imax = MAX (imax, idx);
    7140       456022 :           if (idx - tmp_perm[0] != j)
    7141       132373 :             any_permute = true;
    7142              :         }
    7143              :       /* If the span doesn't match we'd disrupt VF computation, avoid
    7144              :          that for now.  */
    7145       278709 :       if (imax - imin + 1 != SLP_TREE_LANES (node))
    7146        79790 :         continue;
    7147              :       /* If there's no permute no need to split one out.  In this case
    7148              :          we can consider turning a load into a permuted load, if that
    7149              :          turns out to be cheaper than alternatives.  */
    7150       198919 :       if (!any_permute)
    7151              :         {
    7152       182753 :           partition.layout = -1;
    7153       182753 :           continue;
    7154              :         }
    7155              : 
    7156              :       /* For now only handle true permutes, like
    7157              :          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
    7158              :          when permuting constants and invariants keeping the permute
    7159              :          bijective.  */
    7160        16166 :       auto_sbitmap load_index (SLP_TREE_LANES (node));
    7161        16166 :       bitmap_clear (load_index);
    7162        62502 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7163        46336 :         bitmap_set_bit (load_index, tmp_perm[j] - imin);
    7164              :       unsigned j;
    7165        61826 :       for (j = 0; j < SLP_TREE_LANES (node); ++j)
    7166        45794 :         if (!bitmap_bit_p (load_index, j))
    7167              :           break;
    7168        16166 :       if (j != SLP_TREE_LANES (node))
    7169          134 :         continue;
    7170              : 
    7171        16032 :       vec<unsigned> perm = vNULL;
    7172        16032 :       perm.safe_grow (SLP_TREE_LANES (node), true);
    7173        61591 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7174        45559 :         perm[j] = tmp_perm[j] - imin;
    7175              : 
    7176        32064 :       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
    7177              :         {
    7178              :           /* Continue to use existing layouts, but don't add any more.  */
    7179            0 :           int *entry = layout_ids.get (perm);
    7180            0 :           partition.layout = entry ? *entry : 0;
    7181            0 :           perm.release ();
    7182              :         }
    7183              :       else
    7184              :         {
    7185        16032 :           bool existed;
    7186        16032 :           int &layout_i = layout_ids.get_or_insert (perm, &existed);
    7187        16032 :           if (existed)
    7188         5496 :             perm.release ();
    7189              :           else
    7190              :             {
    7191        10536 :               layout_i = m_perms.length ();
    7192        10536 :               m_perms.safe_push (perm);
    7193              :             }
    7194        16032 :           partition.layout = layout_i;
    7195              :         }
    7196        16166 :     }
    7197              : 
    7198              :   /* Initially assume that every layout is possible and has zero cost
    7199              :      in every partition.  */
    7200       624215 :   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
    7201      1248430 :                                               * m_perms.length ());
    7202              : 
    7203              :   /* We have to mark outgoing permutations facing non-associating-reduction
    7204              :      graph entries that are not represented as to be materialized.
    7205              :      slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
    7206      3250931 :   for (slp_instance instance : m_vinfo->slp_instances)
    7207      1378286 :     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
    7208              :       {
    7209         5977 :         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7210         5977 :         m_partitions[m_vertices[node_i].partition].layout = 0;
    7211              :       }
    7212      1372309 :     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
    7213              :       {
    7214         1399 :         stmt_vec_info stmt_info
    7215         1399 :           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
    7216         1399 :         vect_reduc_info reduc_info
    7217         1399 :           = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
    7218              :                                 SLP_INSTANCE_TREE (instance));
    7219         1399 :         if (needs_fold_left_reduction_p (TREE_TYPE
    7220              :                                            (gimple_get_lhs (stmt_info->stmt)),
    7221              :                                          VECT_REDUC_INFO_CODE (reduc_info)))
    7222              :           {
    7223           64 :             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7224           64 :             m_partitions[m_vertices[node_i].partition].layout = 0;
    7225              :           }
    7226              :       }
    7227              : 
    7228              :   /* Check which layouts each node and partition can handle.  Calculate the
    7229              :      weights associated with inserting layout changes on edges.  */
    7230      5007994 :   for (unsigned int node_i : m_partitioned_nodes)
    7231              :     {
    7232      3135349 :       auto &vertex = m_vertices[node_i];
    7233      3135349 :       auto &partition = m_partitions[vertex.partition];
    7234      3135349 :       slp_tree node = vertex.node;
    7235              : 
    7236      3135349 :       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    7237              :         {
    7238      3130747 :           vertex.weight = vect_slp_node_weight (node);
    7239              : 
    7240              :           /* We do not handle stores with a permutation, so all
    7241              :              incoming permutations must have been materialized.
    7242              : 
    7243              :              We also don't handle masked grouped loads, which lack a
    7244              :              permutation vector.  In this case the memory locations
    7245              :              form an implicit second input to the loads, on top of the
    7246              :              explicit mask input, and the memory input's layout cannot
    7247              :              be changed.
    7248              : 
    7249              :              On the other hand, we do support permuting gather loads and
    7250              :              masked gather loads, where each scalar load is independent
    7251              :              of the others.  This can be useful if the address/index input
    7252              :              benefits from permutation.  */
    7253      3130747 :           if (STMT_VINFO_DATA_REF (rep)
    7254      1621698 :               && STMT_VINFO_GROUPED_ACCESS (rep)
    7255      4195154 :               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7256       900767 :             partition.layout = 0;
    7257              : 
    7258              :           /* We cannot change the layout of an operation whose lanes
    7259              :              are not independent of each other.  Note this is an explicit
    7260              :              negative list since that's much shorter than the respective
    7261              :              positive one, but it's critical to keep maintaining it.  */
    7262      3130747 :           if (is_gimple_call (STMT_VINFO_STMT (rep)))
    7263        23350 :             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
    7264              :               {
    7265         1071 :               case CFN_COMPLEX_ADD_ROT90:
    7266         1071 :               case CFN_COMPLEX_ADD_ROT270:
    7267         1071 :               case CFN_COMPLEX_MUL:
    7268         1071 :               case CFN_COMPLEX_MUL_CONJ:
    7269         1071 :               case CFN_VEC_ADDSUB:
    7270         1071 :               case CFN_VEC_FMADDSUB:
    7271         1071 :               case CFN_VEC_FMSUBADD:
    7272         1071 :                 partition.layout = 0;
    7273              :               default:;
    7274              :               }
    7275              :         }
    7276              : 
    7277      6943883 :       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
    7278              :         {
    7279      3808534 :           auto &other_vertex = m_vertices[other_node_i];
    7280              : 
    7281              :           /* Count the number of edges from earlier partitions and the number
    7282              :              of edges to later partitions.  */
    7283      3808534 :           if (other_vertex.partition < vertex.partition)
    7284      1904267 :             partition.in_degree += 1;
    7285              :           else
    7286      1904267 :             partition.out_degree += 1;
    7287              : 
    7288              :           /* If the current node uses the result of OTHER_NODE_I, accumulate
    7289              :              the effects of that.  */
    7290      3808534 :           if (ud->src == int (node_i))
    7291              :             {
    7292      1904267 :               other_vertex.out_weight += vertex.weight;
    7293      1904267 :               other_vertex.out_degree += 1;
    7294              :             }
    7295      6943883 :         };
    7296      3135349 :       for_each_partition_edge (node_i, process_edge);
    7297              :     }
    7298       624215 : }
    7299              : 
    7300              : /* Return the incoming costs for node NODE_I, assuming that each input keeps
    7301              :    its current (provisional) choice of layout.  The inputs do not necessarily
    7302              :    have the same layout as each other.
                      :
                      :    Only edges whose other end lies in an earlier partition than NODE_I's
                      :    contribute.  For each such edge, take the in_cost recorded for the
                      :    other partition's current layout, add that layout's internal_cost as
                      :    a serial cost, split the sum according to the other partition's
                      :    out_degree, and add the result to the running total as a parallel
                      :    cost.  The running total starts as a default-constructed
                      :    slpg_layout_cost.  */
    7303              : 
    7304              : slpg_layout_cost
    7305         3116 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
    7306              : {
    7307         3116 :   auto &vertex = m_vertices[node_i];
    7308         3116 :   slpg_layout_cost cost;
    7309        11365 :   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
    7310              :     {
    7311         8249 :       auto &other_vertex = m_vertices[other_node_i];
    7312         8249 :       if (other_vertex.partition < vertex.partition)
    7313              :         {
    7314         5228 :           auto &other_partition = m_partitions[other_vertex.partition];
    7315        10456 :           auto &other_costs = partition_layout_costs (other_vertex.partition,
    7316         5228 :                                                       other_partition.layout);
    7317         5228 :           slpg_layout_cost this_cost = other_costs.in_cost;
    7318         5228 :           this_cost.add_serial_cost (other_costs.internal_cost);
    7319         5228 :           this_cost.split (other_partition.out_degree);
    7320         5228 :           cost.add_parallel_cost (this_cost);
    7321              :         }
    7322        11365 :     };
    7323         3116 :   for_each_partition_edge (node_i, add_cost);
    7324         3116 :   return cost;
    7325              : }
    7326              : 
    7327              : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
    7328              :    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
    7329              :    slpg_layout_cost::impossible () if the change isn't possible.
                      :
                      :    UD->dest is treated as the definition and UD->src as the use.
                      :    NODE1_I may be either end of UD: if it is the definition, LAYOUT1_I
                      :    is the definition's layout and LAYOUT2_I the use's; otherwise the
                      :    roles are swapped.  The cost is the factor returned by
                      :    change_layout_cost for the definition node, multiplied by the weight
                      :    of the vertex at which the change is placed; a negative factor means
                      :    the change is impossible.  */
    7330              : 
    7331              : slpg_layout_cost
    7332       674851 : vect_optimize_slp_pass::
    7333              : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
    7334              :                   unsigned int layout2_i)
    7335              : {
    7336       674851 :   auto &def_vertex = m_vertices[ud->dest];
    7337       674851 :   auto &use_vertex = m_vertices[ud->src];
    7338       674851 :   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
    7339       674851 :   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
    7340       674851 :   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
    7341              :                                     use_layout_i);
    7342       674851 :   if (factor < 0)
    7343         5037 :     return slpg_layout_cost::impossible ();
    7344              : 
    7345              :   /* We have a choice of putting the layout change at the site of the
    7346              :      definition or at the site of the use.  Prefer the former when
    7347              :      optimizing for size or when the execution frequency of the
    7348              :      definition is no greater than the combined execution frequencies of
    7349              :      the uses.  When putting the layout change at the site of the definition,
    7350              :      divvy up the cost among all consumers.
                      :
                      :      Here the definition's frequency is its vertex weight, the combined
                      :      frequency of the uses is its out_weight, and the number of consumers
                      :      is its out_degree.  */
    7351       669814 :   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
    7352              :     {
    7353       653368 :       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
    7354       653368 :       cost.split (def_vertex.out_degree);
    7355       653368 :       return cost;
    7356              :     }
    7357        16446 :   return { use_vertex.weight * factor, m_optimize_size };
    7358              : }
    7359              : 
    7360              : /* UD represents a use-def link between FROM_NODE_I and a node in a later
    7361              :    partition; FROM_NODE_I could be the definition node or the use node.
    7362              :    The node at the other end of the link wants to use layout TO_LAYOUT_I.
    7363              :    Return the cost of any necessary fix-ups on edge UD, or return
    7364              :    slpg_layout_cost::impossible () if the change isn't possible.
    7365              : 
    7366              :    At this point, FROM_NODE_I's partition has chosen the cheapest
    7367              :    layout based on the information available so far, but this choice
    7368              :    is only provisional.
                      :
                      :    The result is the cheaper of two options: FROM_PARTITION keeps its
                      :    current layout and UD pays edge_layout_cost for the change, or
                      :    FROM_PARTITION itself switches to TO_LAYOUT_I (considered only if
                      :    that layout is still marked possible for it).  In both options the
                      :    partition's in_cost plus internal_cost for the layout in question is
                      :    split according to the partition's out_degree.  If the edge change is
                      :    impossible and the current layout is 0, the result is impossible and
                      :    the second option is not considered.  FROM_PARTITION must already
                      :    have a nonnegative layout.  */
    7369              : 
    7370              : slpg_layout_cost
    7371       177147 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
    7372              :                                       unsigned int to_layout_i)
    7373              : {
    7374       177147 :   auto &from_vertex = m_vertices[from_node_i];
    7375       177147 :   unsigned int from_partition_i = from_vertex.partition;
    7376       177147 :   slpg_partition_info &from_partition = m_partitions[from_partition_i];
    7377       177147 :   gcc_assert (from_partition.layout >= 0);
    7378              : 
    7379              :   /* First calculate the cost on the assumption that FROM_PARTITION sticks
    7380              :      with its current layout preference.  */
    7381       177147 :   slpg_layout_cost cost = slpg_layout_cost::impossible ();
    7382       177147 :   auto edge_cost = edge_layout_cost (ud, from_node_i,
    7383       177147 :                                      from_partition.layout, to_layout_i);
    7384       177147 :   if (edge_cost.is_possible ())
    7385              :     {
    7386       349028 :       auto &from_costs = partition_layout_costs (from_partition_i,
    7387       174514 :                                                  from_partition.layout);
    7388       174514 :       cost = from_costs.in_cost;
    7389       174514 :       cost.add_serial_cost (from_costs.internal_cost);
    7390       174514 :       cost.split (from_partition.out_degree);
    7391       174514 :       cost.add_serial_cost (edge_cost);
    7392              :     }
    7393         2633 :   else if (from_partition.layout == 0)
    7394              :     /* We must allow the source partition to have layout 0 as a fallback,
    7395              :        in case all other options turn out to be impossible.  */
    7396         2633 :     return cost;
    7397              : 
    7398              :   /* Take the minimum of that cost and the cost that applies if
    7399              :      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
    7400       174514 :   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
    7401              :                                                       to_layout_i);
    7402       174514 :   if (direct_layout_costs.is_possible ())
    7403              :     {
    7404       157992 :       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
    7405       157992 :       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
    7406       157992 :       direct_cost.split (from_partition.out_degree);
    7407       157992 :       if (!cost.is_possible ()
    7408       157992 :           || direct_cost.is_better_than (cost, m_optimize_size))
    7409        42096 :         cost = direct_cost;
    7410              :     }
    7411              : 
    7412       174514 :   return cost;
    7413              : }
    7414              : 
    7415              : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
    7416              :    partition; TO_NODE_I could be the definition node or the use node.
    7417              :    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
    7418              :    return the cost of any necessary fix-ups on edge UD, or
    7419              :    slpg_layout_cost::impossible () if the choice cannot be made.
    7420              : 
    7421              :    At this point, TO_NODE_I's partition has a fixed choice of layout.
                      :
                      :    Two strategies are tried in order.  If TO_NODE_I is the use end of UD
                      :    and its node satisfies SLP_TREE_PERMUTE_P, ask internal_node_cost
                      :    whether the node can absorb the change; if it returns a nonnegative
                      :    factor, the result is the partition's out_cost plus TO_NODE_I's weight
                      :    times that factor, split according to the partition's in_degree.
                      :    Otherwise the result is the out_cost plus internal_cost, split
                      :    according to in_degree, plus edge_layout_cost for UD, or impossible
                      :    if that edge cost is impossible.  TO_NODE_I's partition must already
                      :    have a nonnegative layout.  */
    7422              : 
    7423              : slpg_layout_cost
    7424       164395 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
    7425              :                                        unsigned int from_layout_i)
    7426              : {
    7427       164395 :   auto &to_vertex = m_vertices[to_node_i];
    7428       164395 :   unsigned int to_partition_i = to_vertex.partition;
    7429       164395 :   slpg_partition_info &to_partition = m_partitions[to_partition_i];
    7430       164395 :   gcc_assert (to_partition.layout >= 0);
    7431              : 
    7432              :   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
    7433              :      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
    7434              :      any other inputs keep their current choice of layout.
                      :
                      :      The layout recorded for the definition's partition is overridden
                      :      with FROM_LAYOUT_I only for the duration of the internal_node_cost
                      :      call and is restored immediately afterwards.  */
    7435       164395 :   auto &to_costs = partition_layout_costs (to_partition_i,
    7436              :                                            to_partition.layout);
    7437       164395 :   if (ud->src == int (to_node_i)
    7438       164233 :       && SLP_TREE_PERMUTE_P (to_vertex.node))
    7439              :     {
    7440         9275 :       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
    7441         9275 :       auto old_layout = from_partition.layout;
    7442         9275 :       from_partition.layout = from_layout_i;
    7443        18550 :       int factor = internal_node_cost (to_vertex.node, -1,
    7444         9275 :                                        to_partition.layout);
    7445         9275 :       from_partition.layout = old_layout;
    7446         9275 :       if (factor >= 0)
    7447              :         {
    7448         8643 :           slpg_layout_cost cost = to_costs.out_cost;
    7449        17286 :           cost.add_serial_cost ({ to_vertex.weight * factor,
    7450         8643 :                                   m_optimize_size });
    7451         8643 :           cost.split (to_partition.in_degree);
    7452         8643 :           return cost;
    7453              :         }
    7454              :     }
    7455              : 
    7456              :   /* Compute the cost if we insert any necessary layout change on edge UD.  */
    7457       155752 :   auto edge_cost = edge_layout_cost (ud, to_node_i,
    7458       155752 :                                      to_partition.layout, from_layout_i);
    7459       155752 :   if (edge_cost.is_possible ())
    7460              :     {
    7461       155752 :       slpg_layout_cost cost = to_costs.out_cost;
    7462       155752 :       cost.add_serial_cost (to_costs.internal_cost);
    7463       155752 :       cost.split (to_partition.in_degree);
    7464       155752 :       cost.add_serial_cost (edge_cost);
    7465       155752 :       return cost;
    7466              :     }
    7467              : 
    7468            0 :   return slpg_layout_cost::impossible ();
    7469              : }
    7470              : 
    7471              : /* Make a forward pass through the partitions, accumulating input costs.
    7472              :    Make a tentative (provisional) choice of layout for each partition,
    7473              :    ensuring that this choice still allows later partitions to keep
    7474              :    their original layout.
                      :
                      :    Partitions are visited in increasing index order.  For each partition,
                      :    every layout that is still marked possible is costed.  A candidate
                      :    layout is marked impossible if the partition's recorded layout is 0
                      :    and the candidate is not layout 0, if any node in the partition fails
                      :    is_compatible_layout, if forward_cost reports an impossible cost on an
                      :    edge from an earlier partition, if edge_layout_cost reports that
                      :    switching between the candidate and layout 0 is impossible on one of
                      :    the remaining edges (those to later partitions), or if
                      :    internal_node_cost returns a negative factor.  The cheapest surviving
                      :    layout becomes the partition's provisional layout; at least one
                      :    layout is asserted to survive.  */
    7475              : 
    7476              : void
    7477         5251 : vect_optimize_slp_pass::forward_pass ()
    7478              : {
    7479       113390 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    7480              :        ++partition_i)
    7481              :     {
    7482       108139 :       auto &partition = m_partitions[partition_i];
    7483              : 
    7484              :       /* If the partition consists of a single VEC_PERM_EXPR, precompute
    7485              :          the incoming cost that would apply if every predecessor partition
    7486              :          keeps its current layout.  This is used within the loop below.  */
    7487       108139 :       slpg_layout_cost in_cost;
    7488       108139 :       slp_tree single_node = nullptr;
    7489       108139 :       if (partition.node_end == partition.node_begin + 1)
    7490              :         {
    7491       104297 :           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
    7492       104297 :           single_node = m_vertices[node_i].node;
    7493       104297 :           if (SLP_TREE_PERMUTE_P (single_node))
    7494         3116 :             in_cost = total_in_cost (node_i);
    7495              :         }
    7496              : 
    7497              :       /* Go through the possible layouts.  Decide which ones are valid
    7498              :          for this partition and record which of the valid layouts has
    7499              :          the lowest cost.  */
    7500       108139 :       unsigned int min_layout_i = 0;
    7501       108139 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7502       330308 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7503              :         {
    7504       222169 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7505       222169 :           if (!layout_costs.is_possible ())
    7506        49822 :             continue;
    7507              : 
    7508              :           /* If the recorded layout is already 0 then the layout cannot
    7509              :              change, so mark every other layout as impossible for this
                      :              partition.  */
    7510       222169 :           if (partition.layout == 0 && layout_i != 0)
    7511              :             {
    7512        36153 :               layout_costs.mark_impossible ();
    7513        36153 :               continue;
    7514              :             }
    7515              : 
    7516       186016 :           bool is_possible = true;
    7517       377155 :           for (unsigned int order_i = partition.node_begin;
    7518       377155 :                order_i < partition.node_end; ++order_i)
    7519              :             {
    7520       202573 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7521       202573 :               auto &vertex = m_vertices[node_i];
    7522              : 
    7523              :               /* Reject the layout if it is individually incompatible
    7524              :                  with any node in the partition.  */
    7525       202573 :               if (!is_compatible_layout (vertex.node, layout_i))
    7526              :                 {
    7527        10396 :                   is_possible = false;
    7528        11434 :                   break;
    7529              :                 }
    7530              : 
    7531       536891 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7532              :                 {
    7533       344714 :                   auto &other_vertex = m_vertices[other_node_i];
    7534       344714 :                   if (other_vertex.partition < vertex.partition)
    7535              :                     {
    7536              :                       /* Accumulate the incoming costs from earlier
    7537              :                          partitions, plus the cost of any layout changes
    7538              :                          on UD itself.  */
    7539       177147 :                       auto cost = forward_cost (ud, other_node_i, layout_i);
    7540       177147 :                       if (!cost.is_possible ())
    7541         2633 :                         is_possible = false;
    7542              :                       else
    7543       174514 :                         layout_costs.in_cost.add_parallel_cost (cost);
    7544              :                     }
    7545              :                   else
    7546              :                     /* Reject the layout if it would make layout 0 impossible
    7547              :                        for later partitions.  This amounts to testing that the
    7548              :                        target supports reversing the layout change on edges
    7549              :                        to later partitions.
    7550              : 
    7551              :                        In principle, it might be possible to push a layout
    7552              :                        change all the way down a graph, so that it never
    7553              :                        needs to be reversed and so that the target doesn't
    7554              :                        need to support the reverse operation.  But it would
    7555              :                        be awkward to bail out if we hit a partition that
    7556              :                        does not support the new layout, especially since
    7557              :                        we are not dealing with a lattice.  */
    7558       167567 :                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
    7559       167567 :                                                      layout_i).is_possible ();
    7560       536891 :                 };
    7561       192177 :               for_each_partition_edge (node_i, add_cost);
    7562              : 
    7563              :               /* Accumulate the cost of using LAYOUT_I within NODE,
    7564              :                  both for the inputs and the outputs.  A negative factor
                      :                  means the node cannot use LAYOUT_I, and a zero factor adds
                      :                  nothing to the internal cost.  */
    7565       192177 :               int factor = internal_node_cost (vertex.node, layout_i,
    7566              :                                                layout_i);
    7567       192177 :               if (factor < 0)
    7568              :                 {
    7569         1038 :                   is_possible = false;
    7570         1038 :                   break;
    7571              :                 }
    7572       191139 :               else if (factor)
    7573        31370 :                 layout_costs.internal_cost.add_serial_cost
    7574        31370 :                   ({ vertex.weight * factor, m_optimize_size });
    7575              :             }
    7576       186016 :           if (!is_possible)
    7577              :             {
    7578        13669 :               layout_costs.mark_impossible ();
    7579        13669 :               continue;
    7580              :             }
    7581              : 
    7582              :           /* Combine the incoming and partition-internal costs.  */
    7583       172347 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7584       172347 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7585              : 
    7586              :           /* If this partition consists of a single VEC_PERM_EXPR, see
    7587              :              if the VEC_PERM_EXPR can be changed to support output layout
    7588              :              LAYOUT_I while keeping all the provisional choices of input
    7589              :              layout.  If that alternative is cheaper, it replaces the
                      :              in_cost and internal_cost recorded for LAYOUT_I.  */
    7590       172347 :           if (single_node && SLP_TREE_PERMUTE_P (single_node))
    7591              :             {
    7592         5418 :               int factor = internal_node_cost (single_node, -1, layout_i);
    7593         5418 :               if (factor >= 0)
    7594              :                 {
    7595         4973 :                   auto weight = m_vertices[single_node->vertex].weight;
    7596         4973 :                   slpg_layout_cost internal_cost
    7597         4973 :                     = { weight * factor, m_optimize_size };
    7598              : 
    7599         4973 :                   slpg_layout_cost alt_cost = in_cost;
    7600         4973 :                   alt_cost.add_serial_cost (internal_cost);
    7601         4973 :                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
    7602              :                     {
    7603         1577 :                       combined_cost = alt_cost;
    7604         1577 :                       layout_costs.in_cost = in_cost;
    7605         1577 :                       layout_costs.internal_cost = internal_cost;
    7606              :                     }
    7607              :                 }
    7608              :             }
    7609              : 
    7610              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7611              :              the event of a tie between it and another layout.  */
    7612       172347 :           if (!min_layout_cost.is_possible ()
    7613        64208 :               || combined_cost.is_better_than (min_layout_cost,
    7614        64208 :                                                m_optimize_size))
    7615              :             {
    7616       121530 :               min_layout_i = layout_i;
    7617       121530 :               min_layout_cost = combined_cost;
    7618              :             }
    7619              :         }
    7620              : 
    7621              :       /* This loop's handling of earlier partitions should ensure that
    7622              :          choosing the original layout for the current partition is no
    7623              :          less valid than it was in the original graph, even with the
    7624              :          provisional layout choices for those earlier partitions.  */
    7625       108139 :       gcc_assert (min_layout_cost.is_possible ());
    7626       108139 :       partition.layout = min_layout_i;
    7627              :     }
    7628         5251 : }
    7629              : 
    7630              : /* Make a backward pass through the partitions, accumulating output costs.
    7631              :    Make a final choice of layout for each partition.  */
    7632              : 
    7633              : void
    7634         5251 : vect_optimize_slp_pass::backward_pass ()
    7635              : {
    7636       118641 :   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
    7637              :     {
    7638       108139 :       auto &partition = m_partitions[partition_i];
    7639              : 
    7640       108139 :       unsigned int min_layout_i = 0;
    7641       108139 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7642       330308 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7643              :         {
    7644       222169 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7645       222169 :           if (!layout_costs.is_possible ())
    7646        49822 :             continue;
    7647              : 
    7648              :           /* Accumulate the costs from successor partitions.  */
    7649       172347 :           bool is_possible = true;
    7650       361220 :           for (unsigned int order_i = partition.node_begin;
    7651       361220 :                order_i < partition.node_end; ++order_i)
    7652              :             {
    7653       188873 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7654       188873 :               auto &vertex = m_vertices[node_i];
    7655       527653 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7656              :                 {
    7657       338780 :                   auto &other_vertex = m_vertices[other_node_i];
    7658       338780 :                   auto &other_partition = m_partitions[other_vertex.partition];
    7659       338780 :                   if (other_vertex.partition > vertex.partition)
    7660              :                     {
    7661              :                       /* Accumulate the incoming costs from later
    7662              :                          partitions, plus the cost of any layout changes
    7663              :                          on UD itself.  */
    7664       164395 :                       auto cost = backward_cost (ud, other_node_i, layout_i);
    7665       164395 :                       if (!cost.is_possible ())
    7666            0 :                         is_possible = false;
    7667              :                       else
    7668       164395 :                         layout_costs.out_cost.add_parallel_cost (cost);
    7669              :                     }
    7670              :                   else
    7671              :                     /* Make sure that earlier partitions can (if necessary
    7672              :                        or beneficial) keep the layout that they chose in
    7673              :                        the forward pass.  This ensures that there is at
    7674              :                        least one valid choice of layout.  */
    7675       174385 :                     is_possible &= edge_layout_cost (ud, other_node_i,
    7676       174385 :                                                      other_partition.layout,
    7677       174385 :                                                      layout_i).is_possible ();
    7678       527653 :                 };
    7679       188873 :               for_each_partition_edge (node_i, add_cost);
    7680              :             }
    7681       172347 :           if (!is_possible)
    7682              :             {
    7683            0 :               layout_costs.mark_impossible ();
    7684            0 :               continue;
    7685              :             }
    7686              : 
    7687              :           /* Locally combine the costs from the forward and backward passes.
    7688              :              (This combined cost is not passed on, since that would lead
    7689              :              to double counting.)  */
    7690       172347 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7691       172347 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7692       172347 :           combined_cost.add_serial_cost (layout_costs.out_cost);
    7693              : 
    7694              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7695              :              the event of a tie between it and another layout.  */
    7696       172347 :           if (!min_layout_cost.is_possible ()
    7697        64208 :               || combined_cost.is_better_than (min_layout_cost,
    7698        64208 :                                                m_optimize_size))
    7699              :             {
    7700       116056 :               min_layout_i = layout_i;
    7701       116056 :               min_layout_cost = combined_cost;
    7702              :             }
    7703              :         }
    7704              : 
    7705       108139 :       gcc_assert (min_layout_cost.is_possible ());
    7706       108139 :       partition.layout = min_layout_i;
    7707              :     }
    7708         5251 : }
    7709              : 
    7710              : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
    7711              :    NODE already has the layout that was selected for its partition.  */
    7712              : 
    7713              : slp_tree
    7714       145124 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
    7715              :                                                 unsigned int to_layout_i)
    7716              : {
    7717       145124 :   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
    7718       145124 :   slp_tree result = m_node_layouts[result_i];
    7719       145124 :   if (result)
    7720              :     return result;
    7721              : 
    7722       144658 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    7723       144658 :       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
    7724              :           /* We can't permute vector defs in place.  */
    7725        20187 :           && SLP_TREE_VEC_DEFS (node).is_empty ()))
    7726              :     {
    7727              :       /* If the vector is uniform or unchanged, there's nothing to do.  */
    7728        37515 :       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
    7729              :         result = node;
    7730              :       else
    7731              :         {
    7732         1956 :           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
    7733         1956 :           result = vect_create_new_slp_node (scalar_ops);
    7734         1956 :           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
    7735              :         }
    7736              :     }
    7737              :   else
    7738              :     {
    7739       107143 :       unsigned int partition_i = m_vertices[node->vertex].partition;
    7740       107143 :       unsigned int from_layout_i = m_partitions[partition_i].layout;
    7741       107143 :       if (from_layout_i == to_layout_i)
    7742       106607 :         return node;
    7743              : 
    7744              :       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
    7745              :          permutation instead of a serial one.  Leave the new permutation
    7746              :          in TMP_PERM on success.  */
    7747          536 :       auto_lane_permutation_t tmp_perm;
    7748          536 :       unsigned int num_inputs = 1;
    7749          536 :       if (SLP_TREE_PERMUTE_P (node))
    7750              :         {
    7751            7 :           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7752            7 :           if (from_layout_i != 0)
    7753            7 :             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
    7754            7 :           if (to_layout_i != 0)
    7755            4 :             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
    7756            7 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7757              :                                               tmp_perm,
    7758            7 :                                               SLP_TREE_CHILDREN (node),
    7759              :                                               false) >= 0)
    7760            7 :             num_inputs = SLP_TREE_CHILDREN (node).length ();
    7761              :           else
    7762            0 :             tmp_perm.truncate (0);
    7763              :         }
    7764              : 
    7765          536 :       if (dump_enabled_p ())
    7766              :         {
    7767           68 :           if (tmp_perm.length () > 0)
    7768            6 :             dump_printf_loc (MSG_NOTE, vect_location,
    7769              :                              "duplicating permutation node %p with"
    7770              :                              " layout %d\n",
    7771              :                              (void *) node, to_layout_i);
    7772              :           else
    7773           62 :             dump_printf_loc (MSG_NOTE, vect_location,
    7774              :                              "inserting permutation node in place of %p\n",
    7775              :                              (void *) node);
    7776              :         }
    7777              : 
    7778          536 :       unsigned int num_lanes = SLP_TREE_LANES (node);
    7779          536 :       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
    7780          536 :       if (SLP_TREE_SCALAR_STMTS (node).length ())
    7781              :         {
    7782          535 :           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
    7783          535 :           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
    7784          535 :           if (from_layout_i != 0)
    7785          269 :             vect_slp_permute (m_perms[from_layout_i], stmts, false);
    7786          535 :           if (to_layout_i != 0)
    7787          270 :             vect_slp_permute (m_perms[to_layout_i], stmts, true);
    7788              :         }
    7789          536 :       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
    7790          536 :       SLP_TREE_LANES (result) = num_lanes;
    7791          536 :       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
    7792          536 :       result->vertex = -1;
    7793              : 
    7794          536 :       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
    7795          536 :       if (tmp_perm.length ())
    7796              :         {
    7797            7 :           lane_perm.safe_splice (tmp_perm);
    7798            7 :           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
    7799              :         }
    7800              :       else
    7801              :         {
    7802          529 :           lane_perm.create (num_lanes);
    7803         1651 :           for (unsigned j = 0; j < num_lanes; ++j)
    7804         1122 :             lane_perm.quick_push ({ 0, j });
    7805          529 :           if (from_layout_i != 0)
    7806          262 :             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
    7807          529 :           if (to_layout_i != 0)
    7808          267 :             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
    7809          529 :           SLP_TREE_CHILDREN (result).safe_push (node);
    7810              :         }
    7811         2148 :       for (slp_tree child : SLP_TREE_CHILDREN (result))
    7812          540 :         child->refcnt++;
    7813          536 :     }
    7814        38051 :   m_node_layouts[result_i] = result;
    7815        38051 :   return result;
    7816              : }
    7817              : 
    7818              : /* Apply the chosen vector layouts to the SLP graph.  */
    7819              : 
    7820              : void
    7821        10117 : vect_optimize_slp_pass::materialize ()
    7822              : {
    7823              :   /* We no longer need the costs, so avoid having two O(N * P) arrays
    7824              :      live at the same time.  */
    7825        10117 :   m_partition_layout_costs.release ();
    7826        30351 :   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
    7827              : 
    7828        20234 :   auto_sbitmap fully_folded (m_vertices.length ());
    7829        10117 :   bitmap_clear (fully_folded);
    7830       155359 :   for (unsigned int node_i : m_partitioned_nodes)
    7831              :     {
    7832       125008 :       auto &vertex = m_vertices[node_i];
    7833       125008 :       slp_tree node = vertex.node;
    7834       125008 :       int layout_i = m_partitions[vertex.partition].layout;
    7835       125008 :       gcc_assert (layout_i >= 0);
    7836              : 
    7837              :       /* Rearrange the scalar statements to match the chosen layout.  */
    7838       125008 :       if (layout_i > 0)
    7839        15430 :         vect_slp_permute (m_perms[layout_i],
    7840        15430 :                           SLP_TREE_SCALAR_STMTS (node), true);
    7841              : 
    7842              :       /* Update load and lane permutations.  */
    7843       125008 :       if (SLP_TREE_PERMUTE_P (node))
    7844              :         {
    7845              :           /* First try to absorb the input vector layouts.  If that fails,
    7846              :              force the inputs to have layout LAYOUT_I too.  We checked that
    7847              :              that was possible before deciding to use nonzero output layouts.
    7848              :              (Note that at this stage we don't really have any guarantee that
    7849              :              the target supports the original VEC_PERM_EXPR.)  */
    7850         4519 :           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
    7851         4519 :           auto_lane_permutation_t tmp_perm;
    7852         4519 :           tmp_perm.safe_splice (perm);
    7853         4519 :           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
    7854         4519 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7855              :                                               tmp_perm,
    7856         4519 :                                               SLP_TREE_CHILDREN (node),
    7857              :                                               false) >= 0)
    7858              :             {
    7859         4150 :               if (dump_enabled_p ()
    7860         5042 :                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
    7861              :                                   perm.begin ()))
    7862           58 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7863              :                                  "absorbing input layouts into %p\n",
    7864              :                                  (void *) node);
    7865        23827 :               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
    7866         4150 :               bitmap_set_bit (fully_folded, node_i);
    7867              :             }
    7868              :           else
    7869              :             {
    7870              :               /* Not MSG_MISSED because it would make no sense to users.  */
    7871          369 :               if (dump_enabled_p ())
    7872           46 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7873              :                                  "failed to absorb input layouts into %p\n",
    7874              :                                  (void *) node);
    7875          369 :               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
    7876              :             }
    7877         4519 :         }
    7878              :       else
    7879              :         {
    7880       120489 :           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
    7881       120489 :           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
    7882       120489 :           if (layout_i > 0)
    7883              :             /* ???  When we handle non-bijective permutes the idea
    7884              :                is that we can force the load-permutation to be
    7885              :                { min, min + 1, min + 2, ... max }.  But then the
    7886              :                scalar defs might no longer match the lane content
    7887              :                which means wrong-code with live lane vectorization.
    7888              :                So we possibly have to have NULL entries for those.  */
    7889        15327 :             vect_slp_permute (m_perms[layout_i], load_perm, true);
    7890              :         }
    7891              :     }
    7892              : 
    7893              :   /* Do this before any nodes disappear, since it involves a walk
    7894              :      over the leaves.  */
    7895        10117 :   remove_redundant_permutations ();
    7896              : 
    7897              :   /* Replace each child with a correctly laid-out version.  */
    7898       155359 :   for (unsigned int node_i : m_partitioned_nodes)
    7899              :     {
    7900              :       /* Skip nodes that have already been handled above.  */
    7901       125008 :       if (bitmap_bit_p (fully_folded, node_i))
    7902         4150 :         continue;
    7903              : 
    7904       120858 :       auto &vertex = m_vertices[node_i];
    7905       120858 :       int in_layout_i = m_partitions[vertex.partition].layout;
    7906       120858 :       gcc_assert (in_layout_i >= 0);
    7907              : 
    7908              :       unsigned j;
    7909              :       slp_tree child;
    7910       359703 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
    7911              :         {
    7912       148781 :           if (!child)
    7913         3657 :             continue;
    7914              : 
    7915       145124 :           slp_tree new_child = get_result_with_layout (child, in_layout_i);
    7916       145124 :           if (new_child != child)
    7917              :             {
    7918         2701 :               vect_free_slp_tree (child);
    7919         2701 :               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
    7920         2701 :               new_child->refcnt += 1;
    7921              :             }
    7922              :         }
    7923              :     }
    7924        10117 : }
    7925              : 
    7926              : /* Elide load permutations that are not necessary.  Such permutations might
    7927              :    be pre-existing, rather than created by the layout optimizations.  */
    7928              : 
    7929              : void
    7930       624215 : vect_optimize_slp_pass::remove_redundant_permutations ()
    7931              : {
    7932      4135546 :   for (unsigned int node_i : m_leafs)
    7933              :     {
    7934      2262901 :       slp_tree node = m_vertices[node_i].node;
    7935      2262901 :       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7936      1741437 :         continue;
    7937              : 
    7938              :       /* In basic block vectorization we allow any subchain of an interleaving
    7939              :          chain.
    7940              :          FORNOW: not in loop SLP because of realignment complications.  */
    7941       521464 :       if (is_a <bb_vec_info> (m_vinfo))
    7942              :         {
    7943       155631 :           bool subchain_p = true;
    7944              :           stmt_vec_info next_load_info = NULL;
    7945              :           stmt_vec_info load_info;
    7946              :           unsigned j;
    7947       155631 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    7948              :             {
    7949       126818 :               if (j != 0
    7950       126818 :                   && (next_load_info != load_info
    7951        59901 :                       || ! load_info
    7952        59901 :                       || DR_GROUP_GAP (load_info) != 1))
    7953              :                 {
    7954              :                   subchain_p = false;
    7955              :                   break;
    7956              :                 }
    7957       104295 :               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
    7958              :             }
    7959        51336 :           if (subchain_p)
    7960              :             {
    7961        28813 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    7962        28813 :               continue;
    7963              :             }
    7964              :         }
    7965              :       else
    7966              :         {
    7967       470128 :           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
    7968       470128 :           bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
    7969              :           /* When this isn't a grouped access we know it's single element
    7970              :              and contiguous.  */
    7971       470128 :           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
    7972              :             {
    7973       357824 :               if (!this_load_permuted
    7974       357824 :                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    7975       357210 :                       || SLP_TREE_LANES (node) == 1))
    7976       357199 :                 SLP_TREE_LOAD_PERMUTATION (node).release ();
    7977       357824 :               continue;
    7978              :             }
    7979       112304 :           stmt_vec_info first_stmt_info
    7980       112304 :             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
    7981       112705 :           if (!this_load_permuted
    7982              :               /* The load requires permutation when unrolling exposes
    7983              :                  a gap either because the group is larger than the SLP
    7984              :                  group-size or because there is a gap between the groups.  */
    7985       112304 :               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    7986        95057 :                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
    7987          124 :                       && DR_GROUP_GAP (first_stmt_info) == 0)))
    7988              :             {
    7989          401 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    7990          401 :               continue;
    7991              :             }
    7992              :         }
    7993              :     }
    7994       624215 : }
    7995              : 
    7996              : /* Print the partition graph and layout information to the dump file.  */
    7997              : 
    7998              : void
    7999          659 : vect_optimize_slp_pass::dump ()
    8000              : {
    8001          659 :   dump_printf_loc (MSG_NOTE, vect_location,
    8002              :                    "SLP optimize permutations:\n");
    8003         1331 :   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
    8004              :     {
    8005          672 :       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
    8006          672 :       const char *sep = "";
    8007         5769 :       for (unsigned int idx : m_perms[layout_i])
    8008              :         {
    8009         3753 :           dump_printf (MSG_NOTE, "%s%d", sep, idx);
    8010         3753 :           sep = ", ";
    8011              :         }
    8012          672 :       dump_printf (MSG_NOTE, " }\n");
    8013              :     }
    8014          659 :   dump_printf_loc (MSG_NOTE, vect_location,
    8015              :                    "SLP optimize partitions:\n");
    8016         5420 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    8017              :        ++partition_i)
    8018              :     {
    8019         4761 :       auto &partition = m_partitions[partition_i];
    8020         4761 :       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
    8021         4761 :       dump_printf_loc (MSG_NOTE, vect_location,
    8022              :                        "  partition %d (layout %d):\n",
    8023              :                        partition_i, partition.layout);
    8024         4761 :       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
    8025         9750 :       for (unsigned int order_i = partition.node_begin;
    8026         9750 :            order_i < partition.node_end; ++order_i)
    8027              :         {
    8028         4989 :           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
    8029         9978 :           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
    8030         4989 :                            (void *) vertex.node);
    8031         4989 :           dump_printf_loc (MSG_NOTE, vect_location,
    8032              :                            "          weight: %f\n",
    8033              :                            vertex.weight.to_double ());
    8034         4989 :           if (vertex.out_degree)
    8035         3888 :             dump_printf_loc (MSG_NOTE, vect_location,
    8036              :                              "          out weight: %f (degree %d)\n",
    8037              :                              vertex.out_weight.to_double (),
    8038              :                              vertex.out_degree);
    8039         4989 :           if (SLP_TREE_PERMUTE_P (vertex.node))
    8040          492 :             dump_printf_loc (MSG_NOTE, vect_location,
    8041              :                              "          op: VEC_PERM_EXPR\n");
    8042         4497 :           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
    8043         4479 :             dump_printf_loc (MSG_NOTE, vect_location,
    8044              :                              "          op template: %G", rep->stmt);
    8045              :         }
    8046         4761 :       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
    8047         9750 :       for (unsigned int order_i = partition.node_begin;
    8048         9750 :            order_i < partition.node_end; ++order_i)
    8049              :         {
    8050         4989 :           unsigned int node_i = m_partitioned_nodes[order_i];
    8051         4989 :           auto &vertex = m_vertices[node_i];
    8052        15041 :           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
    8053              :             {
    8054        10052 :               auto &other_vertex = m_vertices[other_node_i];
    8055        10052 :               if (other_vertex.partition < vertex.partition)
    8056         5026 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8057              :                                  "      - %p [%d] --> %p\n",
    8058         5026 :                                  (void *) other_vertex.node,
    8059              :                                  other_vertex.partition,
    8060         5026 :                                  (void *) vertex.node);
    8061              :               else
    8062         5026 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8063              :                                  "      - %p --> [%d] %p\n",
    8064         5026 :                                  (void *) vertex.node,
    8065              :                                  other_vertex.partition,
    8066         5026 :                                  (void *) other_vertex.node);
    8067        15041 :             };
    8068         4989 :           for_each_partition_edge (node_i, print_edge);
    8069              :         }
    8070              : 
    8071        14482 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    8072              :         {
    8073         9721 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    8074         9721 :           if (layout_costs.is_possible ())
    8075              :             {
    8076         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8077              :                                "    layout %d:%s\n", layout_i,
    8078         7976 :                                partition.layout == int (layout_i)
    8079              :                                ? " (*)" : "");
    8080         7976 :               slpg_layout_cost combined_cost = layout_costs.in_cost;
    8081         7976 :               combined_cost.add_serial_cost (layout_costs.internal_cost);
    8082         7976 :               combined_cost.add_serial_cost (layout_costs.out_cost);
    8083              : #define TEMPLATE "{depth: %f, total: %f}"
    8084         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8085              :                                "        " TEMPLATE "\n",
    8086              :                                layout_costs.in_cost.depth.to_double (),
    8087              :                                layout_costs.in_cost.total.to_double ());
    8088         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8089              :                                "      + " TEMPLATE "\n",
    8090              :                                layout_costs.internal_cost.depth.to_double (),
    8091              :                                layout_costs.internal_cost.total.to_double ());
    8092         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8093              :                                "      + " TEMPLATE "\n",
    8094              :                                layout_costs.out_cost.depth.to_double (),
    8095              :                                layout_costs.out_cost.total.to_double ());
    8096         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8097              :                                "      = " TEMPLATE "\n",
    8098              :                                combined_cost.depth.to_double (),
    8099              :                                combined_cost.total.to_double ());
    8100              : #undef TEMPLATE
    8101              :             }
    8102              :           else
    8103         1745 :             dump_printf_loc (MSG_NOTE, vect_location,
    8104              :                              "    layout %d: rejected\n", layout_i);
    8105              :         }
    8106              :     }
    8107          659 : }
    8108              : 
    8109              : /* Masked load lanes discovery.  Flags grouped IFN_MASK_LOAD nodes, and the single-lane permutes consuming them, as load-lanes when the mask is a splat.  */
    8110              : 
    8111              : void
    8112       624215 : vect_optimize_slp_pass::decide_masked_load_lanes ()
    8113              : {
    8114      6414676 :   for (auto v : m_vertices)
    8115              :     {
    8116      4542031 :       slp_tree node = v.node;
    8117      4542031 :       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    8118      3133636 :           || SLP_TREE_PERMUTE_P (node))
    8119      1539199 :         continue;
    8120      3002832 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    8121      1506472 :       if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
    8122              :           /* The mask has to be uniform.  */
    8123       949532 :           || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    8124       949401 :           || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    8125      3002917 :           || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    8126              :                                        IFN_MASK_LOAD))
    8127      3002799 :         continue;
    8128           33 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
    8129           66 :       if (STMT_VINFO_STRIDED_P (stmt_info)
    8130           33 :           || compare_step_with_zero (m_vinfo, stmt_info) <= 0
    8131           63 :           || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
    8132           30 :                                         DR_GROUP_SIZE (stmt_info),
    8133              :                                         true) == IFN_LAST)
    8134           33 :         continue;
    8135              : 
    8136              :       /* Uniform masks need to be suitably represented: a single-child permute whose lane permutation entries are all (0, 0).  */
    8137            0 :       slp_tree mask = SLP_TREE_CHILDREN (node)[0];
    8138            0 :       if (!SLP_TREE_PERMUTE_P (mask)
    8139            0 :           || SLP_TREE_CHILDREN (mask).length () != 1)
    8140            0 :         continue;
    8141            0 :       bool match = true;
    8142            0 :       for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
    8143            0 :         if (perm.first != 0 || perm.second != 0)
    8144              :           {
    8145              :             match = false;
    8146              :             break;
    8147              :           }
    8148            0 :       if (!match)
    8149            0 :         continue;
    8150              : 
    8151              :       /* Now see if the consumer side matches.  */
    8152            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8153            0 :            pred; pred = pred->pred_next)
    8154              :         {
    8155            0 :           slp_tree pred_node = m_vertices[pred->src].node;
    8156              :           /* All consumers should be a permute with a single outgoing lane.  */
    8157            0 :           if (!SLP_TREE_PERMUTE_P (pred_node)
    8158            0 :               || SLP_TREE_LANES (pred_node) != 1)
    8159              :             {
    8160              :               match = false;
    8161              :               break;
    8162              :             }
    8163            0 :           gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
    8164              :         }
    8165            0 :       if (!match)
    8166            0 :         continue;
    8167              :       /* Now we can mark the node and all of its consumers as using load lanes.  */
    8168            0 :       node->ldst_lanes = true;
    8169            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8170            0 :            pred; pred = pred->pred_next)
    8171            0 :         m_vertices[pred->src].node->ldst_lanes = true;
    8172              :       /* The catch is that we have to massage the mask.  We have arranged
    8173              :          for analyzed uniform masks to be represented by a splat VEC_PERM,
    8174              :          which we can now simply elide since we cannot easily re-do SLP
    8175              :          discovery here.  */
    8176            0 :       slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
    8177            0 :       SLP_TREE_REF_COUNT (new_mask)++;
    8178            0 :       SLP_TREE_CHILDREN (node)[0] = new_mask;
    8179            0 :       vect_free_slp_tree (mask);
    8180              :     }
    8181       624215 : }
    8182              : 
    8183              : /* Perform legitimizing attempts.  This is intended to improve the
    8184              :    situation when layout 0 is not valid, which is a situation the
    8185              :    cost-based propagation does not handle well.
    8186              :    Return true if further layout optimization is possible, false if
    8187              :    the layout configuration should be considered final.  */
    8188              : 
    8189              : bool
    8190        10117 : vect_optimize_slp_pass::legitimize ()
    8191              : {
    8192              :   /* Perform a very simple legitimizing attempt by attempting to choose
    8193              :      a single layout for all partitions that will make all permutations
    8194              :      a noop.  That should also be the optimal layout choice in case
    8195              :      layout zero is legitimate.
    8196              :      ???  Disconnected components of the SLP graph could have distinct
    8197              :      single layouts.  */
    8198        10117 :   int single_layout_i = -1;
    8199        10117 :   unsigned deferred_up_to = -1U;
    8200        30391 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8201              :        ++partition_i)
    8202              :     {
    8203        25519 :       auto &partition = m_partitions[partition_i];
    8204        25519 :       if (single_layout_i == -1)
    8205              :         {
    8206        13310 :           single_layout_i = partition.layout;
    8207        13310 :           deferred_up_to = partition_i;
    8208              :         }
    8209        12209 :       else if (partition.layout == single_layout_i || partition.layout == -1)
    8210              :         ;
    8211              :       else
    8212              :         single_layout_i = 0;
    8213        22323 :       if (single_layout_i == 0)
    8214              :         return true;
    8215              : 
    8216        20334 :       if (single_layout_i != -1
    8217        20334 :           && !is_compatible_layout (partition, single_layout_i))
    8218              :         return true;
    8219              :     }
    8220              : 
    8221         4872 :   if (single_layout_i <= 0)
    8222              :     return true;
    8223              :   /* Also check the leading partitions that were visited before a layout had been chosen.  */
    8224         4988 :   for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
    8225          122 :     if (!is_compatible_layout (m_partitions[partition_i],
    8226              :                                single_layout_i))
    8227              :       return true;
    8228              : 
    8229        12105 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8230              :        ++partition_i)
    8231              :     {
    8232         7239 :       auto &partition = m_partitions[partition_i];
    8233         7239 :       partition.layout = single_layout_i;
    8234              :     }
    8235              : 
    8236              :   return false;
    8237              : }
    8238              : 
    8239              : /* Main entry point for the SLP graph optimization pass.  Always finishes with masked load-lanes discovery on a rebuilt graph.  */
    8240              : 
    8241              : void
    8242       624215 : vect_optimize_slp_pass::run ()
    8243              : {
    8244       624215 :   build_graph ();
    8245       624215 :   create_partitions ();
    8246       624215 :   start_choosing_layouts ();
    8247       624215 :   if (m_perms.length () > 1)
    8248              :     {
    8249        10117 :       if (legitimize ())
    8250              :         {
    8251         5251 :           forward_pass ();
    8252         5251 :           backward_pass ();
    8253              :         }
    8254        10117 :       if (dump_enabled_p ())
    8255          659 :         dump ();
    8256        10117 :       materialize ();
    8257        40887 :       while (!m_perms.is_empty ())
    8258        20653 :         m_perms.pop ().release ();
    8259              :     }
    8260              :   else
    8261       614098 :     remove_redundant_permutations ();
    8262       624215 :   free_graph (m_slpg);
    8263       624215 :   build_graph ();
    8264       624215 :   decide_masked_load_lanes ();
    8265       624215 :   free_graph (m_slpg);
    8266       624215 : }
    8267              : 
    8268              : /* Apply CSE to NODE and its children using BST_MAP.  NODE is replaced by its leader when an equivalent node was already recorded.  */
    8269              : 
    8270              : static void
    8271      4860761 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
    8272              : {
    8273      4860761 :   bool put_p = false;
    8274      4860761 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
    8275              :       /* Besides some VEC_PERM_EXPR, two-operator nodes also
    8276              :          lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
    8277              :          we'd have something that works for all internal and external nodes.  */
    8278      4860761 :       && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8279              :     {
    8280      3430607 :       slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
    8281      3430607 :       if (leader)
    8282              :         {
    8283              :           /* We've visited this node already (a null leader is the placeholder stored below before recursing).  */
    8284       320594 :           if (!*leader || *leader == node)
    8285              :             return;
    8286              : 
    8287         2432 :           if (dump_enabled_p ())
    8288          887 :             dump_printf_loc (MSG_NOTE, vect_location,
    8289              :                              "re-using SLP tree %p for %p\n",
    8290              :                              (void *)*leader, (void *)node);
    8291         2432 :           vect_free_slp_tree (node);
    8292         2432 :           (*leader)->refcnt += 1;
    8293         2432 :           node = *leader;
    8294         2432 :           return;
    8295              :         }
    8296              : 
    8297              :       /* Avoid creating a cycle by populating the map only after recursion.  */
    8298      3110013 :       bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
    8299      3110013 :       node->refcnt += 1;
    8300      3110013 :       put_p = true;
    8301              :       /* And recurse.  */
    8302              :     }
    8303              : 
    8304     13392575 :   for (slp_tree &child : SLP_TREE_CHILDREN (node))
    8305      3859386 :     if (child)
    8306      3482475 :       vect_cse_slp_nodes (bst_map, child);
    8307              : 
    8308              :   /* Now record the node for CSE in other siblings.  */
    8309      4540167 :   if (put_p)
    8310      3110013 :     *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
    8311              : }
    8312              : 
    8313              : /* Optimize the SLP graph of VINFO.  Does nothing when VINFO has no SLP instances.  */
    8314              : 
    8315              : void
    8316       967687 : vect_optimize_slp (vec_info *vinfo)
    8317              : {
    8318       967687 :   if (vinfo->slp_instances.is_empty ())
    8319              :     return;
    8320       624215 :   vect_optimize_slp_pass (vinfo).run ();
    8321              : 
    8322              :   /* Apply CSE again to the nodes of every instance after permute optimization.  */
    8323       624215 :   scalar_stmts_to_slp_tree_map_t *bst_map
    8324       624215 :     = new scalar_stmts_to_slp_tree_map_t ();
    8325              : 
    8326      3250931 :   for (auto inst : vinfo->slp_instances)
    8327      1378286 :     vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
    8328              : 
    8329       624215 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    8330              : }
    8331              : 
    8332              : /* Gather the loads reachable from each SLP graph entry of VINFO into that instance's SLP_INSTANCE_LOADS, using a fresh visited set per instance.  */
    8333              : 
    8334              : void
    8335       967687 : vect_gather_slp_loads (vec_info *vinfo)
    8336              : {
    8337       967687 :   unsigned i;
    8338       967687 :   slp_instance instance;
    8339      2345973 :   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    8340              :     {
    8341      1378286 :       hash_set<slp_tree> visited;
    8342      1378286 :       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
    8343              :                              SLP_INSTANCE_TREE (instance), visited);
    8344      1378286 :     }
    8345       967687 : }
    8346              : 
    8347              : /* For NODE update VF based on the number of lanes and the vector types
    8348              :    used.  VISITED records the nodes that were already processed.  */
    8349              : 
    8350              : static void
    8351      3578119 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
    8352              :                              hash_set<slp_tree> &visited)
    8353              : {
    8354      3578119 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    8355      1286504 :     return;
    8356      2565429 :   if (visited.add (node))
    8357              :     return;
    8358              : 
    8359      8652484 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    8360      2914315 :     vect_update_slp_vf_for_node (child, vf, visited);
    8361              : 
    8362              :   /* We do not visit SLP nodes for constants or externals - those neither
    8363              :      have a vector type set yet (vectorizable_* does this) nor do they
    8364              :      have max_nunits set.  Instead we rely on internal nodes' max_nunits
    8365              :      to cover constant/external operands.
    8366              :      Note that when we stop using fixed size vectors externs and constants
    8367              :      shouldn't influence the (minimum) vectorization factor, instead
    8368              :      vectorizable_* should honor the vectorization factor when trying to
    8369              :      assign vector types to constants and externals and cause iteration
    8370              :      to a higher vectorization factor when required.  */
    8371      2291615 :   poly_uint64 node_vf
    8372      2291615 :     = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
    8373      2291615 :   vf = force_common_multiple (vf, node_vf);
    8374              : 
    8375              :   /* For permute nodes that are fed from externs or constants we have to
    8376              :      consider their number of lanes as well.  Likewise for nodes marked ldst_lanes.  */
    8377      2291615 :   if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    8378       645732 :     for (slp_tree child : SLP_TREE_CHILDREN (node))
    8379       171960 :       if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
    8380              :         {
    8381         2858 :           poly_uint64 child_vf
    8382         2858 :             = calculate_unrolling_factor (node->max_nunits,
    8383              :                                           SLP_TREE_LANES (child));
    8384         2858 :           vf = force_common_multiple (vf, child_vf);
    8385              :         }
    8386              : }
    8387              : 
    8388              : /* For each SLP instance of LOOP_VINFO mark its stmts and calculate the overall
    8389              :    unrolling factor needed to SLP the loop, storing it in LOOP_VINFO_VECT_FACTOR.
    8390              :    Return TRUE if at least one instance root has a vector type with more than one element.  */
    8391              : 
    8392              : bool
    8393       405174 : vect_make_slp_decision (loop_vec_info loop_vinfo)
    8394              : {
    8395       405174 :   unsigned int i;
    8396       405174 :   poly_uint64 unrolling_factor = 1;
    8397       405174 :   const vec<slp_instance> &slp_instances
    8398              :     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
    8399       405174 :   slp_instance instance;
    8400       405174 :   int decided_to_slp = 0;
    8401              : 
    8402       405174 :   DUMP_VECT_SCOPE ("vect_make_slp_decision");
    8403              : 
    8404       405174 :   hash_set<slp_tree> visited;
    8405      1068978 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    8406              :     {
    8407       663804 :       slp_tree root = SLP_INSTANCE_TREE (instance);
    8408              : 
    8409              :       /* All unroll factors have the form:
    8410              : 
    8411              :            GET_MODE_SIZE (vinfo->vector_mode) * X
    8412              : 
    8413              :          for some rational X, so they must have a common multiple.  */
    8414       663804 :       vect_update_slp_vf_for_node (root, unrolling_factor, visited);
    8415              : 
    8416              :       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
    8417              :          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
    8418              :          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
    8419       663804 :       vect_mark_slp_stmts (loop_vinfo, root);
    8420              : 
    8421              :       /* If all instances ended up with vector(1) T roots make sure to
    8422              :          not vectorize.  RVV for example relies on loop vectorization
    8423              :          when some instances are essentially kept scalar.  See PR121048.  */
    8424       663804 :       if (SLP_TREE_VECTYPE (root)
    8425       663804 :           && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
    8426       545564 :         decided_to_slp++;
    8427              :     }
    8428              : 
    8429       405174 :   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
    8430              : 
    8431       405174 :   if (decided_to_slp && dump_enabled_p ())
    8432              :     {
    8433        18394 :       dump_printf_loc (MSG_NOTE, vect_location,
    8434              :                        "Decided to SLP %d instances. Unrolling factor ",
    8435              :                        decided_to_slp);
    8436        18394 :       dump_dec (MSG_NOTE, unrolling_factor);
    8437        18394 :       dump_printf (MSG_NOTE, "\n");
    8438              :     }
    8439              : 
    8440       405174 :   return (decided_to_slp > 0);
    8441       405174 : }
    8442              : 
    8443              : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  Statement UIDs are set to 0; debug stmts are not added.  */
    8444              : 
    8445      2197543 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
    8446              :   : vec_info (vec_info::bb, shared),
    8447      2197543 :     roots (vNULL)
    8448              : {
    8449              :   /* The region we are operating on.  bbs[0] is the entry, excluding
    8450              :      its PHI nodes.  In the future we might want to track an explicit
    8451              :      entry edge to cover bbs[0] PHI nodes and have a region entry
    8452              :      insert location.  */
    8453      2197543 :   bbs = _bbs.address ();
    8454      2197543 :   nbbs = _bbs.length ();
    8455              : 
    8456     17788150 :   for (unsigned i = 0; i < nbbs; ++i)
    8457              :     {
    8458     15590607 :       if (i != 0)
    8459     20343661 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8460      6950597 :              gsi_next (&si))
    8461              :           {
    8462      6950597 :             gphi *phi = si.phi ();
    8463      6950597 :             gimple_set_uid (phi, 0);
    8464      6950597 :             add_stmt (phi);
    8465              :           }
    8466     31181214 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8467    135793589 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8468              :         {
    8469    120202982 :           gimple *stmt = gsi_stmt (gsi);
    8470    120202982 :           gimple_set_uid (stmt, 0);
    8471    120202982 :           if (is_gimple_debug (stmt))
    8472     74907747 :             continue;
    8473     45295235 :           add_stmt (stmt);
    8474              :         }
    8475              :     }
    8476      2197543 : }
    8477              : 
    8478              : 
    8479              : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
    8480              :    stmts in the basic blocks of the region, and release the recorded roots.  */
    8481              : 
    8482      2197543 : _bb_vec_info::~_bb_vec_info ()
    8483              : {
    8484              :   /* Reset region marker: set the UID of every stmt, and of every PHI outside bbs[0], back to -1.  */
    8485     17788150 :   for (unsigned i = 0; i < nbbs; ++i)
    8486              :     {
    8487     15590607 :       if (i != 0)
    8488     20359371 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8489      6966307 :              gsi_next (&si))
    8490              :           {
    8491      6966307 :             gphi *phi = si.phi ();
    8492      6966307 :             gimple_set_uid (phi, -1);
    8493              :           }
    8494     31181214 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8495    135735683 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8496              :         {
    8497    120145076 :           gimple *stmt = gsi_stmt (gsi);
    8498    120145076 :           gimple_set_uid (stmt, -1);
    8499              :         }
    8500              :     }
    8501              : 
    8502      3397632 :   for (unsigned i = 0; i < roots.length (); ++i)
    8503              :     {
    8504      1200089 :       roots[i].stmts.release ();
    8505      1200089 :       roots[i].roots.release ();
    8506      1200089 :       roots[i].remain.release ();
    8507              :     }
    8508      2197543 :   roots.release ();
    8509      2197543 : }
    8510              : 
    8511              : /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
    8512              :    given that child nodes have already been processed, and that
    8513              :    their def types currently match their SLP node's def type.  */
    8514              : 
    8515              : static bool
    8516      2443693 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
    8517              :                                     slp_instance node_instance,
    8518              :                                     stmt_vector_for_cost *cost_vec)
    8519              : {
    8520              :   /* Handle purely internal nodes, i.e. permute nodes.  */
    8521      2443693 :   if (SLP_TREE_PERMUTE_P (node))
    8522              :     {
    8523        99267 :       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
    8524              :         return false;
    8525              : 
    8526              :       stmt_vec_info slp_stmt_info;
    8527              :       unsigned int i;
    8528       257030 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
    8529              :         {
    8530       159092 :           if (slp_stmt_info
    8531       154147 :               && STMT_VINFO_LIVE_P (slp_stmt_info)
    8532       159110 :               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
    8533              :                                                node_instance, i,
    8534              :                                                false, cost_vec))
    8535              :             return false;
    8536              :         }
    8537        97938 :       SLP_TREE_TYPE (node) = permute_info_type;
    8538        97938 :       return true;
    8539              :     }
    8540              : 
    8541      2344426 :   return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
    8542              : }
    8543              : 
    8544              : static int
    8545      1847209 : sort_ints (const void *a_, const void *b_)
    8546              : {
    8547      1847209 :   int a = *(const int *)a_;
    8548      1847209 :   int b = *(const int *)b_;
    8549      1847209 :   return a - b;
    8550              : }
    8551              : 
    8552              : /* Verify if we can externalize a set of internal defs.  Returns false if STMTS contains a null entry.  */
    8553              : 
    8554              : static bool
    8555       378223 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
    8556              : {
    8557              :   /* Constant generation uses get_later_stmt which can only handle
    8558              :      defs from the same BB or a set of defs that can be ordered
    8559              :      with a dominance query.  */
    8560       378223 :   basic_block bb = NULL;
    8561       378223 :   bool all_same = true;
    8562       378223 :   auto_vec<int> bbs;
    8563       756446 :   bbs.reserve_exact (stmts.length ());
    8564      2044439 :   for (stmt_vec_info stmt : stmts)
    8565              :     {
    8566       909770 :       if (!stmt)
    8567              :         return false;
    8568       909770 :       else if (!bb)
    8569       378223 :         bb = gimple_bb (stmt->stmt);
    8570       531547 :       else if (gimple_bb (stmt->stmt) != bb)
    8571       172086 :         all_same = false;
    8572       909770 :       bbs.quick_push (gimple_bb (stmt->stmt)->index);
    8573              :     }
    8574       378223 :   if (all_same)
    8575              :     return true;
    8576              : 
    8577              :   /* Produce a sorted vector of unique BB indexes for the defs.  */
    8578       128939 :   bbs.qsort (sort_ints);
    8579              :   unsigned i, j;
    8580       314176 :   for (i = 1, j = 1; i < bbs.length (); ++i)
    8581       185237 :     if (bbs[i] != bbs[j-1])
    8582       137884 :       bbs[j++] = bbs[i];
    8583       128939 :   gcc_assert (j >= 2);
    8584       128939 :   bbs.truncate (j);
    8585              : 
    8586       257878 :   if (bbs.length () == 2)
    8587       125399 :     return (dominated_by_p (CDI_DOMINATORS,
    8588       125399 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
    8589       125399 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
    8590       244135 :             || dominated_by_p (CDI_DOMINATORS,
    8591       118736 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
    8592       118736 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
    8593              : 
    8594              :   /* ???  For more than two BBs we can sort the vector and verify the
    8595              :      result is a total order.  But we can't use vec::qsort with a
    8596              :      compare function using a dominance query since there's no way to
    8597              :      signal failure and any fallback for an unordered pair would
    8598              :      fail qsort_chk later.
    8599              :      For now simply hope that ordering by BB index provides the
    8600              :      best candidate total order.  If required we can implement our
    8601              :      own mergesort or export an entry without checking.  */
    8602       394195 :   for (unsigned i = 1; i < bbs.length (); ++i)
    8603        12461 :     if (!dominated_by_p (CDI_DOMINATORS,
    8604        12461 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
    8605        12461 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
    8606              :       return false;
    8607              : 
    8608              :   return true;
    8609       378223 : }
    8610              : 
    8611              : /* Try to build NODE from scalars, returning true on success.
    8612              :    NODE_INSTANCE is the SLP instance that contains NODE.  Only done for basic-block vectorization and never for the instance root.  */
    8613              : 
    8614              : static bool
    8615       526610 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
    8616              :                               slp_instance node_instance)
    8617              : {
    8618       526610 :   stmt_vec_info stmt_info;
    8619       526610 :   unsigned int i;
    8620              : 
    8621       526610 :   if (!is_a <bb_vec_info> (vinfo)
    8622        70510 :       || node == SLP_INSTANCE_TREE (node_instance)
    8623        22182 :       || !SLP_TREE_SCALAR_STMTS (node).exists ()
    8624        22141 :       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
    8625              :       /* Force the mask use to be built from scalars instead.  */
    8626        19941 :       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
    8627       546336 :       || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    8628       506884 :     return false;
    8629              : 
    8630        19726 :   if (dump_enabled_p ())
    8631           70 :     dump_printf_loc (MSG_NOTE, vect_location,
    8632              :                      "Building vector operands of %p from scalars instead\n",
    8633              :                      (void *) node);
    8634              : 
    8635              :   /* Don't remove and free the child nodes here, since they could be
    8636              :      referenced by other structures.  The analysis and scheduling phases
    8637              :      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
    8638        19726 :   unsigned int group_size = SLP_TREE_LANES (node);
    8639        19726 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
    8640              :   /* Invariants get their vector type from the uses.  */
    8641        19726 :   SLP_TREE_VECTYPE (node) = NULL_TREE;
    8642        19726 :   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
    8643        19726 :   SLP_TREE_LOAD_PERMUTATION (node).release ();
    8644        68630 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    8645              :     {
    8646        48904 :       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
    8647        48904 :       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    8648              :     }
    8649              :   return true;
    8650              : }
    8651              : 
    8652              : /* Return true if every element of the slice is operand_equal_p to the first one; trivially true for fewer than two elements.  */
    8653              : bool
    8654       452526 : vect_scalar_ops_slice::all_same_p () const
    8655              : {
    8656       499858 :   for (unsigned int i = 1; i < length; ++i)
    8657       421723 :     if (!operand_equal_p (op (0), op (i)))
    8658              :       return false;
    8659              :   return true;
    8660              : }
    8661              : /* Hash slice S by folding the hash of each of its operands, in order, into one value.  */
    8662              : hashval_t
    8663       392695 : vect_scalar_ops_slice_hash::hash (const value_type &s)
    8664              : {
    8665       392695 :   hashval_t hash = 0;
    8666      1517292 :   for (unsigned i = 0; i < s.length; ++i)
    8667      1124597 :     hash = iterative_hash_expr (s.op (i), hash);
    8668       392695 :   return hash;
    8669              : }
    8670              : /* Return true if S1 and S2 have the same length and their operands are pairwise operand_equal_p.  */
    8671              : bool
    8672       214115 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
    8673              :                                    const compare_type &s2)
    8674              : {
    8675       214115 :   if (s1.length != s2.length)
    8676              :     return false;
    8677       371814 :   for (unsigned i = 0; i < s1.length; ++i)
    8678       324860 :     if (!operand_equal_p (s1.op (i), s2.op (i)))
    8679              :       return false;
    8680              :   return true;
    8681              : }
    8682              : 
    8683              : /* Compute the prologue cost for invariant or constant operands represented
    8684              :    by NODE.  */
    8685              : 
    8686              : static void
    8687      1037089 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
    8688              :                             stmt_vector_for_cost *cost_vec)
    8689              : {
    8690              :   /* There's a special case of an existing vector, that costs nothing.  */
    8691      1037089 :   if (SLP_TREE_SCALAR_OPS (node).length () == 0
    8692      1037089 :       && !SLP_TREE_VEC_DEFS (node).is_empty ())
    8693         1570 :     return;
    8694              :   /* Without looking at the actual initializer a vector of
    8695              :      constants can be implemented as load from the constant pool.
    8696              :      When all elements are the same we can use a splat.  */
    8697      1035519 :   tree vectype = SLP_TREE_VECTYPE (node);
    8698      1035519 :   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
    8699      1035519 :   unsigned HOST_WIDE_INT const_nunits;
    8700      1035519 :   unsigned nelt_limit;
    8701      1035519 :   unsigned nvectors = vect_get_num_copies (vinfo, node);
    8702      1035519 :   auto ops = &SLP_TREE_SCALAR_OPS (node);
    8703      1035519 :   auto_vec<unsigned int> starts (nvectors);
    8704      1035519 :   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
    8705      1035519 :       && ! multiple_p (const_nunits, group_size))
    8706              :     {
    8707        62478 :       nelt_limit = const_nunits;
    8708        62478 :       hash_set<vect_scalar_ops_slice_hash> vector_ops;
    8709       258465 :       for (unsigned int i = 0; i < nvectors; ++i)
    8710       195987 :         if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
    8711       149033 :           starts.quick_push (i * nelt_limit);
    8712        62478 :     }
    8713              :   else
    8714              :     {
    8715              :       /* If either the vector has variable length or the vectors
    8716              :          are composed of repeated whole groups we only need to
    8717              :          cost construction once.  All vectors will be the same.  */
    8718       973041 :       nelt_limit = group_size;
    8719       973041 :       starts.quick_push (0);
    8720              :     }
    8721              :   /* ???  We're just tracking whether vectors in a single node are the same.
    8722              :      Ideally we'd do something more global.  */
    8723      1035519 :   bool passed = false;
    8724      4228631 :   for (unsigned int start : starts)
    8725              :     {
    8726      1122074 :       vect_cost_for_stmt kind;
    8727      1122074 :       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
    8728              :         kind = vector_load;
    8729       452526 :       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
    8730              :         kind = scalar_to_vec;
    8731              :       else
    8732       374391 :         kind = vec_construct;
    8733              :       /* The target cost hook has no idea which part of the SLP node
    8734              :          we are costing so avoid passing it down more than once.  Pass
    8735              :          it to the first vec_construct or scalar_to_vec part since for those
    8736              :          the x86 backend tries to account for GPR to XMM register moves.  */
    8737      1122074 :       record_stmt_cost (cost_vec, 1, kind, nullptr,
    8738      1122074 :                         (kind != vector_load && !passed) ? node : nullptr,
    8739              :                         vectype, 0, vect_prologue);
    8740      1122074 :       if (kind != vector_load)
    8741       452526 :         passed = true;
    8742              :     }
    8743      1035519 : }
    8744              : 
    8745              : /* Analyze statements contained in SLP tree NODE after recursively analyzing
    8746              :    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
    8747              : 
    8748              :    Return true if the operations are supported.  */
    8749              : 
    8750              : static bool
    8751      4547216 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
    8752              :                                   slp_instance node_instance,
    8753              :                                   hash_set<slp_tree> &visited_set,
    8754              :                                   vec<slp_tree> &visited_vec,
    8755              :                                   stmt_vector_for_cost *cost_vec)
    8756              : {
    8757      4547216 :   int i, j;
    8758      4547216 :   slp_tree child;
    8759              : 
    8760              :   /* Assume we can code-generate all invariants.  */
    8761      4547216 :   if (!node
    8762      4225463 :       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
    8763      3507010 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    8764              :     return true;
    8765              : 
    8766      3002591 :   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    8767              :     {
    8768            9 :       if (dump_enabled_p ())
    8769            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    8770              :                          "Failed cyclic SLP reference in %p\n", (void *) node);
    8771            9 :       return false;
    8772              :     }
    8773      3002582 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
    8774              : 
    8775              :   /* If we already analyzed the exact same set of scalar stmts we're done.
    8776              :      We share the generated vector stmts for those.  */
    8777      3002582 :   if (visited_set.add (node))
    8778              :     return true;
    8779      2733581 :   visited_vec.safe_push (node);
    8780              : 
    8781      2733581 :   bool res = true;
    8782      2733581 :   unsigned visited_rec_start = visited_vec.length ();
    8783      2733581 :   unsigned cost_vec_rec_start = cost_vec->length ();
    8784      2733581 :   bool seen_non_constant_child = false;
    8785      5767069 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    8786              :     {
    8787      3323151 :       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
    8788              :                                               visited_set, visited_vec,
    8789              :                                               cost_vec);
    8790      3323151 :       if (!res)
    8791              :         break;
    8792      3033488 :       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
    8793      3033488 :         seen_non_constant_child = true;
    8794              :     }
    8795              :   /* We're having difficulties scheduling nodes with just constant
    8796              :      operands and no scalar stmts since we then cannot compute a stmt
    8797              :      insertion place.  */
    8798      2733581 :   if (res
    8799      2733581 :       && !seen_non_constant_child
    8800      2733581 :       && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8801              :     {
    8802          225 :       if (dump_enabled_p ())
    8803            6 :         dump_printf_loc (MSG_NOTE, vect_location,
    8804              :                          "Cannot vectorize all-constant op node %p\n",
    8805              :                          (void *) node);
    8806              :       res = false;
    8807              :     }
    8808              : 
    8809      2733356 :   if (res)
    8810      2443693 :     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
    8811              :                                               cost_vec);
    8812              :   /* If analysis failed we have to pop all recursive visited nodes
    8813              :      plus ourselves.  */
    8814      2733581 :   if (!res)
    8815              :     {
    8816      2634270 :       while (visited_vec.length () >= visited_rec_start)
    8817       790525 :         visited_set.remove (visited_vec.pop ());
    8818       526610 :       cost_vec->truncate (cost_vec_rec_start);
    8819              :     }
    8820              : 
    8821              :   /* When the node can be vectorized, cost the invariant nodes it references.
    8822              :      This is not done in DFS order to allow the referring node
    8823              :      vectorizable_* calls to nail down the invariant nodes vector type
    8824              :      and possibly unshare it if it needs a different vector type than
    8825              :      other referrers.  */
    8826      2733581 :   if (res)
    8827      4940693 :     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    8828      2733722 :       if (child
    8829      2475987 :           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
    8830      2475987 :               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
    8831              :           /* Perform usual caching, note code-generation still
    8832              :              code-gens these nodes multiple times but we expect
    8833              :              to CSE them later.  */
    8834      3836425 :           && !visited_set.add (child))
    8835              :         {
    8836      1078067 :           visited_vec.safe_push (child);
    8837              :           /* ???  After auditing more code paths make a "default"
    8838              :              and push the vector type from NODE to all children
    8839              :              if it is not already set.  */
    8840              :           /* Compute the number of vectors to be generated.  */
    8841      1078067 :           tree vector_type = SLP_TREE_VECTYPE (child);
    8842      1078067 :           if (!vector_type)
    8843              :             {
    8844              :               /* Masked loads can have an undefined (default SSA definition)
    8845              :                  else operand.  We do not need to cost it.  */
    8846        40978 :               vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
    8847        42028 :               if (SLP_TREE_TYPE (node) == load_vec_info_type
    8848        42028 :                   && ((ops.length ()
    8849         1050 :                        && TREE_CODE (ops[0]) == SSA_NAME
    8850            0 :                        && SSA_NAME_IS_DEFAULT_DEF (ops[0])
    8851            0 :                        && VAR_P (SSA_NAME_VAR (ops[0])))
    8852         1050 :                       || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
    8853         1050 :                 continue;
    8854              : 
    8855              :               /* For shifts with a scalar argument we don't need
    8856              :                  to cost or code-generate anything.
    8857              :                  ???  Represent this more explicitly.  */
    8858        39928 :               gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
    8859              :                           && j == 1);
    8860        39928 :               continue;
    8861        39928 :             }
    8862              : 
    8863              :           /* And cost them.  */
    8864      1037089 :           vect_prologue_cost_for_slp (vinfo, child, cost_vec);
    8865              :         }
    8866              : 
    8867              :   /* If this node or any of its children can't be vectorized, try pruning
    8868              :      the tree here rather than felling the whole thing.  */
    8869       526610 :   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    8870              :     {
    8871              :       /* We'll need to revisit this for invariant costing and number
    8872              :          of vectorized stmt setting.   */
    8873              :       res = true;
    8874              :     }
    8875              : 
    8876              :   return res;
    8877              : }
    8878              : 
    8879              : /* Given a definition DEF, analyze if it will have any live scalar use after
    8880              :    performing SLP vectorization whose information is represented by BB_VINFO,
    8881              :    and record result into hash map SCALAR_USE_MAP as cache for later fast
    8882              :    check.  If recursion DEPTH exceeds a limit, stop analysis and make a
    8883              :    conservative assumption.  Return 0 if there is no scalar use, 1 if there
    8884              :    is one, and -1 if the recursion limit was reached.  */
    8885              : 
    8886              : static int
    8887       564445 : vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
    8888              :                         hash_map<tree, int> &scalar_use_map,
    8889              :                         int depth = 0)
    8890              : {
    8891       564445 :   const int depth_limit = 3;
    8892       564445 :   imm_use_iterator use_iter;
    8893       564445 :   gimple *use_stmt;
    8894              : 
    8895       564445 :   if (int *res = scalar_use_map.get (def))
    8896        24387 :     return *res;
    8897              : 
    8898       540058 :   int scalar_use = 1;
    8899              : 
    8900      1778066 :   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
    8901              :     {
    8902       818791 :       if (is_gimple_debug (use_stmt))
    8903       183018 :         continue;
    8904              : 
    8905       635773 :       stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
    8906              : 
    8907       635773 :       if (!use_stmt_info)
    8908              :         break;
    8909              : 
    8910       638363 :       if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
    8911       511275 :         continue;
    8912              : 
    8913              :       /* Do not step forward when encountering a PHI statement, since it
    8914              :          may involve a cyclic reference and cause infinite recursion.  */
    8915       118427 :       if (gimple_code (use_stmt) == GIMPLE_PHI)
    8916              :         break;
    8917              : 
    8918              :       /* When pattern recognition is involved, a statement whose definition is
    8919              :          consumed in some pattern may not be included in the final replacement
    8920              :          pattern statements, so would be skipped when building SLP graph.
    8921              : 
    8922              :          * Original
    8923              :           char a_c = *(char *) a;
    8924              :           char b_c = *(char *) b;
    8925              :           unsigned short a_s = (unsigned short) a_c;
    8926              :           int a_i = (int) a_s;
    8927              :           int b_i = (int) b_c;
    8928              :           int r_i = a_i - b_i;
    8929              : 
    8930              :          * After pattern replacement
    8931              :           a_s = (unsigned short) a_c;
    8932              :           a_i = (int) a_s;
    8933              : 
    8934              :           patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
    8935              :           patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
    8936              : 
    8937              :           patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
    8938              :           patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
    8939              : 
    8940              :          The definitions of a_i(original statement) and b_i(pattern statement)
    8941              :          are related to, but actually not part of widen_minus pattern.
    8942              :          Vectorizing the pattern does not cause these definition statements to
    8943              :          be marked as PURE_SLP.  For this case, we need to recursively check
    8944              :          whether their uses are all absorbed into vectorized code.  But there
    8945              :          is an exception that some use may participate in a vectorized
    8946              :          operation via an external SLP node containing that use as an element.
    8947              :          The parameter "scalar_use_map" tags such SSA names as having a scalar
    8948              :          use in advance.  */
    8949        99168 :       tree lhs = gimple_get_lhs (use_stmt);
    8950              : 
    8951        99168 :       if (!lhs || TREE_CODE (lhs) != SSA_NAME)
    8952              :         break;
    8953              : 
    8954        65061 :       if (depth_limit && depth >= depth_limit)
    8955         7473 :         return -1;
    8956              : 
    8957        57588 :       if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
    8958              :                                                 depth + 1)))
    8959              :         break;
    8960         7473 :     }
    8961              : 
    8962       532585 :   if (end_imm_use_stmt_p (&use_iter))
    8963       419217 :     scalar_use = 0;
    8964              : 
    8965              :   /* If recursion is limited, do not cache result for non-root defs.  */
    8966       532585 :   if (!depth || scalar_use >= 0)
    8967              :     {
    8968       517639 :       bool added = scalar_use_map.put (def, scalar_use);
    8969       517639 :       gcc_assert (!added);
    8970              :     }
    8971              : 
    8972       532585 :   return scalar_use;
    8973              : }
    8974              : 
    8975              : /* Mark lanes of NODE that are live outside of the basic-block vectorized
    8976              :    region and that can be vectorized using vectorizable_live_operation
    8977              :    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
    8978              :    scalar code computing it to be retained.  */
    8979              : 
    8980              : static void
    8981       904003 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
    8982              :                              slp_instance instance,
    8983              :                              stmt_vector_for_cost *cost_vec,
    8984              :                              hash_map<tree, int> &scalar_use_map,
    8985              :                              hash_set<stmt_vec_info> &svisited,
    8986              :                              hash_set<slp_tree> &visited)
    8987              : {
    8988       904003 :   if (visited.add (node))
    8989        41342 :     return;
    8990              : 
    8991       862661 :   unsigned i;
    8992       862661 :   stmt_vec_info stmt_info;
    8993       862661 :   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
    8994      3118937 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    8995              :     {
    8996      2256276 :       if (!stmt_info || svisited.contains (stmt_info))
    8997        30133 :         continue;
    8998      2234668 :       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
    8999      2234668 :       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
    9000        11427 :           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
    9001              :         /* Only the pattern root stmt computes the original scalar value.  */
    9002         8525 :         continue;
    9003      2226143 :       bool mark_visited = true;
    9004      2226143 :       gimple *orig_stmt = orig_stmt_info->stmt;
    9005      2226143 :       ssa_op_iter op_iter;
    9006      2226143 :       def_operand_p def_p;
    9007      4959143 :       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
    9008              :         {
    9009       506857 :           if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
    9010              :                                       scalar_use_map))
    9011              :             {
    9012        90885 :               STMT_VINFO_LIVE_P (stmt_info) = true;
    9013        90885 :               if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
    9014              :                                                instance, i, false, cost_vec))
    9015              :                 /* ???  So we know we can vectorize the live stmt from one SLP
    9016              :                    node.  If we cannot do so from all or none consistently
    9017              :                    we'd have to record which SLP node (and lane) we want to
    9018              :                    use for the live operation.  So make sure we can
    9019              :                    code-generate from all nodes.  */
    9020              :                 mark_visited = false;
    9021              :               else
    9022            0 :                 STMT_VINFO_LIVE_P (stmt_info) = false;
    9023              :             }
    9024              : 
    9025              :           /* We have to verify whether we can insert the lane extract
    9026              :              before all uses.  The following is a conservative approximation.
    9027              :              We cannot put this into vectorizable_live_operation because
    9028              :              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
    9029              :              doesn't work.
    9030              :              Note that while the fact that we emit code for loads at the
    9031              :              first load should make this a non-problem, leaves we construct
    9032              :              from scalars are vectorized after the last scalar def.
    9033              :              ???  If we'd actually compute the insert location during
    9034              :              analysis we could use sth less conservative than the last
    9035              :              scalar stmt in the node for the dominance check.  */
    9036              :           /* ???  What remains is "live" uses in vector CTORs in the same
    9037              :              SLP graph which is where those uses can end up code-generated
    9038              :              right after their definition instead of close to their original
    9039              :              use.  But that would restrict us to code-generate lane-extracts
    9040              :              from the latest stmt in a node.  So we compensate for this
    9041              :              during code-generation, simply not replacing uses for those
    9042              :              hopefully rare cases.  */
    9043       506857 :           imm_use_iterator use_iter;
    9044       506857 :           gimple *use_stmt;
    9045       506857 :           stmt_vec_info use_stmt_info;
    9046              : 
    9047       506857 :           if (STMT_VINFO_LIVE_P (stmt_info))
    9048       613505 :             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9049       431735 :               if (!is_gimple_debug (use_stmt)
    9050       323347 :                   && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
    9051       313925 :                       || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
    9052       610581 :                   && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
    9053              :                 {
    9054        17397 :                   if (dump_enabled_p ())
    9055           57 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9056              :                                      "Cannot determine insertion place for "
    9057              :                                      "lane extract\n");
    9058        17397 :                   STMT_VINFO_LIVE_P (stmt_info) = false;
    9059        17397 :                   mark_visited = true;
    9060        90885 :                 }
    9061              :         }
    9062      2226143 :       if (mark_visited)
    9063      2149729 :         svisited.add (stmt_info);
    9064              :     }
    9065              : 
    9066              :   slp_tree child;
    9067      2495951 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9068       874426 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9069       229984 :       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
    9070              :                                    scalar_use_map, svisited, visited);
    9071              : }
    9072              : 
    9073              : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
    9074              :    are live outside of the basic-block vectorized region and that can be
    9075              :    vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
    9076              : 
    9077              : static void
    9078       263665 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
    9079              : {
    9080       263665 :   if (bb_vinfo->slp_instances.is_empty ())
    9081        29789 :     return;
    9082              : 
    9083       233876 :   hash_set<stmt_vec_info> svisited;
    9084       233876 :   hash_set<slp_tree> visited;
    9085       233876 :   hash_map<tree, int> scalar_use_map;
    9086       233876 :   auto_vec<slp_tree> worklist;
    9087              : 
    9088      1375647 :   for (slp_instance instance : bb_vinfo->slp_instances)
    9089              :     {
    9090       674019 :       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
    9091        58699 :         for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
    9092        16748 :           if (TREE_CODE (op) == SSA_NAME)
    9093        14100 :             scalar_use_map.put (op, 1);
    9094       674019 :       if (!visited.add (SLP_INSTANCE_TREE (instance)))
    9095       671925 :         worklist.safe_push (SLP_INSTANCE_TREE (instance));
    9096              :     }
    9097              : 
    9098      1505754 :   do
    9099              :     {
    9100      1505754 :       slp_tree node = worklist.pop ();
    9101              : 
    9102      1505754 :       if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
    9103              :         {
    9104      1542075 :           for (tree op : SLP_TREE_SCALAR_OPS (node))
    9105       680932 :             if (TREE_CODE (op) == SSA_NAME)
    9106       458685 :               scalar_use_map.put (op, 1);
    9107              :         }
    9108              :       else
    9109              :         {
    9110      3610303 :           for (slp_tree child : SLP_TREE_CHILDREN (node))
    9111       874402 :             if (child && !visited.add (child))
    9112       833829 :               worklist.safe_push (child);
    9113              :         }
    9114              :     }
    9115      3011508 :   while (!worklist.is_empty ());
    9116              : 
    9117       233876 :   visited.empty ();
    9118              : 
    9119      1375647 :   for (slp_instance instance : bb_vinfo->slp_instances)
    9120              :     {
    9121       674019 :       vect_location = instance->location ();
    9122       674019 :       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
    9123              :                                    instance, &instance->cost_vec,
    9124              :                                    scalar_use_map, svisited, visited);
    9125              :     }
    9126       233876 : }
    9127              : 
    9128              : /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
    9129              : 
    9130              : static bool
    9131        73949 : vectorizable_bb_reduc_epilogue (slp_instance instance,
    9132              :                                 stmt_vector_for_cost *cost_vec)
    9133              : {
    9134        73949 :   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
    9135        73949 :   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
    9136        73949 :   if (reduc_code == MINUS_EXPR)
    9137            0 :     reduc_code = PLUS_EXPR;
    9138        73949 :   internal_fn reduc_fn;
    9139        73949 :   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
    9140        73949 :   if (!vectype
    9141        73937 :       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
    9142        73937 :       || reduc_fn == IFN_LAST
    9143        73937 :       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
    9144       108848 :       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    9145        34899 :                                      TREE_TYPE (vectype)))
    9146              :     {
    9147        49363 :       if (dump_enabled_p ())
    9148          271 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9149              :                          "not vectorized: basic block reduction epilogue "
    9150              :                          "operation unsupported.\n");
    9151        49363 :       return false;
    9152              :     }
    9153              : 
    9154              :   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
    9155              :      cost log2 vector operations plus shuffles and one extraction.  */
    9156        24586 :   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
    9157        24586 :   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
    9158              :                     vectype, 0, vect_body);
    9159        24586 :   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
    9160              :                     vectype, 0, vect_body);
    9161        24586 :   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
    9162              :                     vectype, 0, vect_body);
    9163              : 
    9164              :   /* Since we replace all stmts of a possibly longer scalar reduction
    9165              :      chain account for the extra scalar stmts for that.  */
    9166        24586 :   if (!instance->remain_defs.is_empty ())
    9167        19790 :     record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
    9168         9895 :                       instance->root_stmts[0], 0, vect_body);
    9169              :   return true;
    9170              : }
    9171              : 
    9172              : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
    9173              :    and recurse to children.  */
    9174              : 
    9175              : static void
    9176       182495 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
    9177              :                               hash_set<slp_tree> &visited)
    9178              : {
    9179       182495 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    9180       182495 :       || visited.add (node))
    9181        81051 :     return;
    9182              : 
    9183              :   stmt_vec_info stmt;
    9184              :   unsigned i;
    9185       344026 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    9186       242582 :     if (stmt)
    9187       246336 :       roots.remove (vect_orig_stmt (stmt));
    9188              : 
    9189              :   slp_tree child;
    9190       225162 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9191       123718 :     if (child)
    9192       122342 :       vect_slp_prune_covered_roots (child, roots, visited);
    9193              : }
    9194              : 
    9195              : /* Analyze statements in SLP instances of VINFO.  Return true if the
    9196              :    operations are supported.
                      :
                      :    Each instance is checked with vect_slp_analyze_node_operations plus
                      :    the kind-specific checks for CTOR, BB reduction and gcond instances.
                      :    When a check fails and VINFO is a loop_vec_info the function returns
                      :    false immediately; otherwise the failed instance is freed and removed
                      :    from VINFO->slp_instances and analysis continues with the next one.
                      :    Afterwards instances whose first root stmt is covered by other
                      :    instances are removed as well.  The final result is true iff at
                      :    least one SLP instance remains in VINFO->slp_instances.  */
    9197              : 
    9198              : bool
    9199       605121 : vect_slp_analyze_operations (vec_info *vinfo)
    9200              : {
    9201       605121 :   slp_instance instance;
    9202       605121 :   int i;
    9203              : 
    9204       605121 :   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
    9205              : 
    9206       605121 :   hash_set<slp_tree> visited;
    9207      1600395 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ) /* I is only advanced for kept instances.  */
    9208              :     {
    9209      1224065 :       auto_vec<slp_tree> visited_vec;
    9210      1224065 :       stmt_vector_for_cost cost_vec;
    9211      1224065 :       cost_vec.create (2);
    9212      1224065 :       if (is_a <bb_vec_info> (vinfo))
    9213       773228 :         vect_location = instance->location ();
    9214      1224065 :       if (!vect_slp_analyze_node_operations (vinfo,
    9215              :                                              SLP_INSTANCE_TREE (instance),
    9216              :                                              instance, visited, visited_vec,
    9217              :                                              &cost_vec)
    9218              :           /* CTOR instances require vectorized defs for the SLP tree root.  */
    9219      1006835 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
    9220         5236 :               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
    9221              :                   != vect_internal_def
    9222              :                   /* Make sure we vectorized with the expected type.  */
    9223         5236 :                   || !useless_type_conversion_p
    9224         5236 :                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
    9225              :                                               (instance->root_stmts[0]->stmt))),
    9226         5236 :                          TREE_TYPE (SLP_TREE_VECTYPE
    9227              :                                             (SLP_INSTANCE_TREE (instance))))))
    9228              :           /* Check we can vectorize the reduction.  */
    9229      1006820 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
    9230        73949 :               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
    9231              :           /* Check we can vectorize the gcond.  */
    9232      2181522 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
    9233        61946 :               && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
    9234        61946 :                                            SLP_INSTANCE_ROOT_STMTS (instance)[0],
    9235              :                                            NULL,
    9236              :                                            SLP_INSTANCE_TREE (instance),
    9237              :                                            &cost_vec)))
    9238              :         {
    9239       326497 :           cost_vec.release ();
    9240       326497 :           slp_tree node = SLP_INSTANCE_TREE (instance);
    9241       326497 :           stmt_vec_info stmt_info;
    9242       326497 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9243       253981 :             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9244        72516 :           else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
    9245        72516 :                    && SLP_TREE_SCALAR_STMTS (node)[0])
    9246              :             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
    9247              :           else
    9248            0 :             stmt_info = SLP_TREE_REPRESENTATIVE (node);
    9249       326497 :           if (is_a <loop_vec_info> (vinfo))
    9250              :             {
    9251       228791 :               if (dump_enabled_p ())
    9252         6319 :                 dump_printf_loc (MSG_NOTE, vect_location,
    9253              :                                  "unsupported SLP instance starting from: %G",
    9254              :                                  stmt_info->stmt);
    9255       228791 :               return false;
    9256              :             }
    9257        97706 :           if (dump_enabled_p ())
    9258          325 :             dump_printf_loc (MSG_NOTE, vect_location,
    9259              :                              "removing SLP instance operations starting from: %G",
    9260              :                              stmt_info->stmt);
    9261       435712 :           while (!visited_vec.is_empty ()) /* Reset every node recorded in VISITED_VEC and drop it from VISITED.  */
    9262              :             {
    9263       338006 :               slp_tree node = visited_vec.pop ();
    9264       338006 :               SLP_TREE_TYPE (node) = undef_vec_info_type;
    9265       338006 :               if (node->data)
    9266              :                 {
    9267        12285 :                   delete node->data;
    9268        12285 :                   node->data = nullptr;
    9269              :                 }
    9270       338006 :               visited.remove (node);
    9271              :             }
    9272        97706 :           vect_free_slp_instance (instance);
    9273        97706 :           vinfo->slp_instances.ordered_remove (i);
    9274              :         }
    9275              :       else
    9276              :         {
    9277       897568 :           i++;
    9278       897568 :           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    9279              :             {
    9280       222046 :               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
    9281       222046 :               cost_vec.release ();
    9282              :             }
    9283              :           else
    9284              :             /* For BB vectorization remember the SLP graph entry
    9285              :                cost for later.  */
    9286       675522 :             instance->cost_vec = cost_vec;
    9287              :         }
    9288      1224065 :     }
    9289              : 
    9290              :   /* Now look for SLP instances with a root that are covered by other
    9291              :      instances and remove them.  Only the first root stmt of each
                      :      instance is entered into ROOTS and checked afterwards.  */
    9292       376330 :   hash_set<stmt_vec_info> roots;
    9293      1581961 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9294       861153 :     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9295        31852 :       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
    9296       376330 :   if (!roots.is_empty ())
    9297              :     {
    9298        12368 :       visited.empty ();
    9299        72521 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9300        60153 :         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
    9301              :                                       visited);
    9302        72521 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    9303        60153 :         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
    9304        31852 :             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
    9305              :           {
    9306         1503 :             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9307         1503 :             if (dump_enabled_p ())
    9308           20 :               dump_printf_loc (MSG_NOTE, vect_location,
    9309              :                                "removing SLP instance operations starting "
    9310              :                                "from: %G", root->stmt);
    9311         1503 :             vect_free_slp_instance (instance);
    9312         1503 :             vinfo->slp_instances.ordered_remove (i);
    9313              :           }
    9314              :         else
    9315        58650 :           ++i;
    9316              :     }
    9317              : 
    9318              :   /* Compute vectorizable live stmts.  */
    9319       376330 :   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    9320       263665 :     vect_bb_slp_mark_live_stmts (bb_vinfo);
    9321              : 
    9322       752660 :   return !vinfo->slp_instances.is_empty ();
    9323       981451 : }
    9324              : 
    9325              : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
    9326              :    closing the eventual chain.  INSTANCE_LEADER is followed from INSTANCE
                      :    until an entry maps to itself; every map slot visited on the way is
                      :    then overwritten to point directly at that final instance, which is
                      :    also the return value.  The lookup result is dereferenced without a
                      :    check, so every instance on the chain has to be present in
                      :    INSTANCE_LEADER.  */
    9327              : 
    9328              : static slp_instance
    9329       737632 : get_ultimate_leader (slp_instance instance,
    9330              :                      hash_map<slp_instance, slp_instance> &instance_leader)
    9331              : {
    9332       737632 :   auto_vec<slp_instance *, 8> chain;
    9333       737632 :   slp_instance *tem;
    9334       812980 :   while (*(tem = instance_leader.get (instance)) != instance)
    9335              :     {
    9336        75348 :       chain.safe_push (tem);
    9337        75348 :       instance = *tem;
    9338              :     }
    9339       812980 :   while (!chain.is_empty ())
    9340        75348 :     *chain.pop () = instance;
    9341       737632 :   return instance;
    9342       737632 : }
    9343              : 
    9344              : namespace {
    9345              : /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
    9346              :    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
    9347              :    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
    9348              : 
    9349              :    INSTANCE_LEADER is as for get_ultimate_leader.
                      :
                      :    More precisely, when KEY was previously mapped to an instance other
                      :    than INSTANCE, the ultimate leader of that instance gets INSTANCE
                      :    recorded as its leader unless that ultimate leader already is
                      :    INSTANCE.  KEY maps to INSTANCE on return in all cases.  */
    9350              : 
    9351              : template<typename T>
    9352              : bool
    9353      3261123 : vect_map_to_instance (slp_instance instance, T key,
    9354              :                       hash_map<T, slp_instance> &key_to_instance,
    9355              :                       hash_map<slp_instance, slp_instance> &instance_leader)
    9356              : {
    9357              :   bool existed_p;
    9358      3261123 :   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
    9359      3261123 :   if (!existed_p)
    9360              :     ;
    9361       172786 :   else if (key_instance != instance)
    9362              :     {
    9363              :       /* If we're running into a previously marked key make us the
    9364              :          leader of the current ultimate leader.  This keeps the
    9365              :          leader chain acyclic and works even when the current instance
    9366              :          connects two previously independent graph parts.  */
    9367        63613 :       slp_instance key_leader
    9368        63613 :         = get_ultimate_leader (key_instance, instance_leader);
    9369        63613 :       if (key_leader != instance)
    9370        19144 :         instance_leader.put (key_leader, instance);
    9371              :     }
    9372      3261123 :   key_instance = instance;
    9373      3261123 :   return existed_p;
    9374              : }
    9375              : }
    9376              : 
    9377              : /* Worker of vect_bb_partition_graph, recurse on NODE.  Every non-null
                      :    scalar stmt of NODE is mapped to INSTANCE in STMT_TO_INSTANCE and
                      :    NODE itself is mapped to INSTANCE in NODE_TO_INSTANCE, both via
                      :    vect_map_to_instance.  If NODE was already present in
                      :    NODE_TO_INSTANCE the walk stops here, otherwise it recurses into
                      :    the children of NODE that are vect_internal_def.  BB_VINFO is only
                      :    forwarded to the recursive calls.  */
    9378              : 
    9379              : static void
    9380       904003 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
    9381              :                            slp_instance instance, slp_tree node,
    9382              :                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
    9383              :                            hash_map<slp_tree, slp_instance> &node_to_instance,
    9384              :                            hash_map<slp_instance, slp_instance> &instance_leader)
    9385              : {
    9386       904003 :   stmt_vec_info stmt_info;
    9387       904003 :   unsigned i;
    9388              : 
    9389      3261123 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9390      2357120 :     if (stmt_info)
    9391      2357120 :       vect_map_to_instance (instance, stmt_info, stmt_to_instance,
    9392              :                             instance_leader);
    9393              : 
    9394       904003 :   if (vect_map_to_instance (instance, node, node_to_instance,
    9395              :                             instance_leader))
    9396       904003 :     return;
    9397              : 
    9398              :   slp_tree child;
    9399      1737087 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9400       874426 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9401       229984 :       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
    9402              :                                  node_to_instance, instance_leader);
    9403              : }
    9404              : 
    9405              : /* Partition the SLP graph into pieces that can be costed independently.
                      :    The result is recorded by pushing each SLP instance of BB_VINFO onto
                      :    the subgraph_entries vector of its ultimate leader instance.  */
    9406              : 
    9407              : static void
    9408       233876 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
    9409              : {
    9410       233876 :   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
    9411              : 
    9412              :   /* First walk the SLP graph assigning each involved scalar stmt a
    9413              :      corresponding SLP graph entry and upon visiting a previously
    9414              :      marked stmt, make the stmt's leader the current SLP graph entry.
                      :      Each instance starts out as its own leader.  */
    9415       233876 :   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
    9416       233876 :   hash_map<slp_tree, slp_instance> node_to_instance;
    9417       233876 :   hash_map<slp_instance, slp_instance> instance_leader;
    9418       233876 :   slp_instance instance;
    9419       907895 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9420              :     {
    9421       674019 :       instance_leader.put (instance, instance);
    9422       674019 :       vect_bb_partition_graph_r (bb_vinfo,
    9423              :                                  instance, SLP_INSTANCE_TREE (instance),
    9424              :                                  stmt_to_instance, node_to_instance,
    9425              :                                  instance_leader);
    9426              :     }
    9427              : 
    9428              :   /* Then collect entries to each independent subgraph.  */
    9429      1141771 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9430              :     {
    9431       674019 :       slp_instance leader = get_ultimate_leader (instance, instance_leader);
    9432       674019 :       leader->subgraph_entries.safe_push (instance);
    9433       674019 :       if (dump_enabled_p ()
    9434       674019 :           && leader != instance)
    9435           69 :         dump_printf_loc (MSG_NOTE, vect_location,
    9436              :                          "instance %p is leader of %p\n",
    9437              :                          (void *) leader, (void *) instance);
    9438              :     }
    9439       233876 : }
    9440              : 
    9441              : /* Compute the set of scalar stmts participating in internal and external
    9442              :    nodes.  Starting from NODE, the non-null scalar stmts of every
                      :    vect_internal_def node are added to VSTMTS and the walk recurses
                      :    into its non-null children.  For any other node the stmts that
                      :    VINFO->lookup_def finds for its scalar operands are added to ESTMTS
                      :    and its children are not walked.  Nodes already in VISITED are
                      :    skipped.  */
    9443              : 
    9444              : static void
    9445      1534727 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
    9446              :                                          hash_set<slp_tree> &visited,
    9447              :                                          hash_set<stmt_vec_info> &vstmts,
    9448              :                                          hash_set<stmt_vec_info> &estmts)
    9449              : {
    9450      1534727 :   int i;
    9451      1534727 :   stmt_vec_info stmt_info;
    9452      1534727 :   slp_tree child;
    9453              : 
    9454      1534727 :   if (visited.add (node))
    9455        41271 :     return;
    9456              : 
    9457      1493456 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    9458              :     {
    9459      3058657 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9460      2205062 :         if (stmt_info)
    9461      2205062 :           vstmts.add (stmt_info);
    9462              : 
    9463      3109305 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9464       864177 :         if (child)
    9465       864177 :           vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
    9466              :                                                    vstmts, estmts);
    9467              :     }
    9468              :   else
    9469      3588113 :     for (tree def : SLP_TREE_SCALAR_OPS (node))
    9470              :       {
    9471      1669566 :         stmt_vec_info def_stmt = vinfo->lookup_def (def);
    9472      1669566 :         if (def_stmt)
    9473       329824 :           estmts.add (def_stmt);
    9474              :       }
    9475              : }
    9476              : 
    9477              : 
    9478              : /* Compute the scalar cost of the SLP node NODE and its children
    9479              :    and record it in COST_VEC (nothing is returned).  Do not account
    9480              :    defs that are marked in LIFE and update LIFE according to uses of NODE.
                      :
                      :    LIFE is indexed by the lane of NODE.  Lanes whose stmt is in
                      :    SCALAR_STMTS_IN_EXTERNS are skipped.  A lane is marked live when one
                      :    of its defs has a non-debug use in a stmt that is not in
                      :    VECTORIZED_SCALAR_STMTS; uses by stmts participating in patterns are
                      :    followed instead of being treated as such a use.  VISITED prevents
                      :    processing a node twice and the gimple visited flag prevents costing
                      :    the same scalar stmt twice.  */
    9481              : 
    9482              : static void
    9483       894093 : vect_bb_slp_scalar_cost (vec_info *vinfo,
    9484              :                          slp_tree node, vec<bool, va_heap> *life,
    9485              :                          stmt_vector_for_cost *cost_vec,
    9486              :                          hash_set<stmt_vec_info> &vectorized_scalar_stmts,
    9487              :                          hash_set<stmt_vec_info> &scalar_stmts_in_externs,
    9488              :                          hash_set<slp_tree> &visited)
    9489              : {
    9490       894093 :   unsigned i;
    9491       894093 :   stmt_vec_info stmt_info;
    9492       894093 :   slp_tree child;
    9493              : 
    9494       894093 :   if (visited.add (node))
    9495        40481 :     return;
    9496              : 
    9497      3058708 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9498              :     {
    9499      2205096 :       ssa_op_iter op_iter;
    9500      2205096 :       def_operand_p def_p;
    9501              : 
    9502      2236402 :       if (!stmt_info
    9503      2205096 :           || (*life)[i]
    9504              :           /* Defs also used in external nodes are not in the
    9505              :              vectorized_scalar_stmts set as they need to be preserved.
    9506              :              Honor that.  */
    9507      4381703 :           || scalar_stmts_in_externs.contains (stmt_info))
    9508       104211 :         continue;
    9509              : 
    9510      2173790 :       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
    9511      2173790 :       gimple *orig_stmt = orig_stmt_info->stmt;
    9512              : 
    9513              :       /* If there is a non-vectorized use of the defs then the scalar
    9514              :          stmt is kept live in which case we do not account it or any
    9515              :          required defs in the SLP children in the scalar cost.  This
    9516              :          way we make the vectorization more costly when compared to
    9517              :          the scalar cost.  */
    9518      2173790 :       if (!STMT_VINFO_LIVE_P (stmt_info))
    9519              :         {
    9520      2107047 :           auto_vec<gimple *, 8> worklist;
    9521      2107047 :           hash_set<gimple *> *worklist_visited = NULL;
    9522      2107047 :           worklist.quick_push (orig_stmt);
    9523      2112047 :           do
    9524              :             {
    9525      2112047 :               gimple *work_stmt = worklist.pop ();
    9526      4619010 :               FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
    9527              :                 {
    9528       415261 :                   imm_use_iterator use_iter;
    9529       415261 :                   gimple *use_stmt;
    9530      1036643 :                   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
    9531              :                                          DEF_FROM_PTR (def_p))
    9532       641727 :                     if (!is_gimple_debug (use_stmt))
    9533              :                       {
    9534       489035 :                         stmt_vec_info use_stmt_info
    9535       489035 :                           = vinfo->lookup_stmt (use_stmt);
    9536       489035 :                         if (!use_stmt_info
    9537       489035 :                             || !vectorized_scalar_stmts.contains (use_stmt_info))
    9538              :                           {
    9539        25445 :                             if (use_stmt_info
    9540        22368 :                                 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
    9541              :                               {
    9542              :                                 /* For stmts participating in patterns we have
    9543              :                                    to check their uses recursively.  */
    9544         5100 :                                 if (!worklist_visited)
    9545         3959 :                                   worklist_visited = new hash_set<gimple *> ();
    9546         5100 :                                 if (!worklist_visited->add (use_stmt))
    9547         5100 :                                   worklist.safe_push (use_stmt);
    9548         5100 :                                 continue;
    9549              :                               }
    9550        20345 :                             (*life)[i] = true;
    9551        20345 :                             goto next_lane;
    9552              :                           }
    9553       415261 :                       }
    9554              :                 }
    9555              :             }
    9556      4183404 :           while (!worklist.is_empty ());
    9557      2086702 : next_lane:
    9558      2107047 :           if (worklist_visited)
    9559         3959 :             delete worklist_visited;
    9560      2107047 :           if ((*life)[i])
    9561        20345 :             continue;
    9562      2107047 :         }
    9563              : 
    9564              :       /* Count scalar stmts only once.  */
    9565      2153445 :       if (gimple_visited_p (orig_stmt))
    9566        24620 :         continue;
    9567      2128825 :       gimple_set_visited (orig_stmt, true);
    9568              : 
    9569      2128825 :       vect_cost_for_stmt kind;
    9570      2128825 :       if (STMT_VINFO_DATA_REF (orig_stmt_info))
    9571              :         {
    9572      1931889 :           data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
    9573      1931889 :           tree base = get_base_address (DR_REF (dr));
    9574              :           /* When the scalar access is to a non-global not address-taken
    9575              :              decl that is not BLKmode assume we can access it with a single
    9576              :              non-load/store instruction.  */
    9577      1931889 :           if (DECL_P (base)
    9578      1495890 :               && !is_global_var (base)
    9579      1419844 :               && !TREE_ADDRESSABLE (base)
    9580      2482419 :               && DECL_MODE (base) != BLKmode)
    9581              :             kind = scalar_stmt;
    9582      1788582 :           else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
    9583              :             kind = scalar_load;
    9584              :           else
    9585      1569876 :             kind = scalar_store;
    9586              :         }
    9587       196936 :       else if (vect_nop_conversion_p (orig_stmt_info))
    9588        19945 :         continue;
    9589              :       /* For single-argument PHIs assume coalescing which means zero cost
    9590              :          for the scalar and the vector PHIs.  This avoids artificially
    9591              :          favoring the vector path (but may pessimize it in some cases).  */
    9592       176991 :       else if (is_a <gphi *> (orig_stmt_info->stmt)
    9593       176991 :                && gimple_phi_num_args
    9594        83469 :                     (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
    9595         7995 :         continue;
    9596              :       else
    9597              :         kind = scalar_stmt;
    9598      2100885 :       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
    9599              :                         SLP_TREE_VECTYPE (node), 0, vect_body);
    9600              :     }
    9601              : 
    9602      1707224 :   auto_vec<bool, 20> subtree_life;
    9603      2469494 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9604              :     {
    9605       864201 :       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9606              :         {
    9607              :           /* Do not directly pass LIFE to the recursive call, copy it to
    9608              :              confine changes in the callee to the current child/subtree.
                      :              For a permute node the liveness of lane J is copied to the
                      :              child lane selected by the lane permutation entry J, but
                      :              only for entries that refer to this child.  */
    9609       223543 :           if (SLP_TREE_PERMUTE_P (node))
    9610              :             {
    9611         3496 :               subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
    9612        12240 :               for (unsigned j = 0;
    9613        12240 :                    j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
    9614              :                 {
    9615         8744 :                   auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
    9616         8744 :                   if (perm.first == i)
    9617         4600 :                     subtree_life[perm.second] = (*life)[j];
    9618              :                 }
    9619              :             }
    9620              :           else
    9621              :             {
    9622       220047 :               gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
    9623       220047 :               subtree_life.safe_splice (*life);
    9624              :             }
    9625       223543 :           vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
    9626              :                                    vectorized_scalar_stmts,
    9627              :                                    scalar_stmts_in_externs, visited);
    9628       223543 :           subtree_life.truncate (0);
    9629              :         }
    9630              :     }
    9631              : }
    9632              : 
    9633              : /* Comparator for the loop-index sorted cost vectors.  A_ and B_ point to
                      :    std::pair<unsigned, stmt_info_for_cost *> elements; only the first
                      :    member is compared.  Returns -1, 0 or 1 when A_'s key is less than,
                      :    equal to or greater than B_'s key.  */
    9634              : 
    9635              : static int
    9636     17466825 : li_cost_vec_cmp (const void *a_, const void *b_)
    9637              : {
    9638     17466825 :   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
    9639     17466825 :   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
    9640     17466825 :   if (a->first < b->first)
    9641              :     return -1;
    9642     16707272 :   else if (a->first == b->first)
    9643     16036403 :     return 0;
    9644              :   return 1;
    9645              : }
    9646              : 
    9647              : /* Check if vectorization of the basic block is profitable for the
    9648              :    subgraph denoted by SLP_INSTANCES.  */
    9649              : 
    9650              : static bool
    9651       651543 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
    9652              :                                     vec<slp_instance> slp_instances,
    9653              :                                     loop_p orig_loop)
    9654              : {
    9655       651543 :   slp_instance instance;
    9656       651543 :   int i;
    9657       651543 :   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
    9658       651543 :   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
    9659              : 
    9660       651543 :   if (dump_enabled_p ())
    9661              :     {
    9662           98 :       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
    9663           98 :       hash_set<slp_tree> visited;
    9664          395 :       FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9665          101 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    9666              :                               SLP_INSTANCE_TREE (instance), visited);
    9667           98 :     }
    9668              : 
    9669              :   /* Compute the set of scalar stmts we know will go away 'locally' when
    9670              :      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
    9671              :      not accurate for nodes promoted extern late or for scalar stmts that
    9672              :      are used both in extern defs and in vectorized defs.  */
    9673       651543 :   hash_set<stmt_vec_info> vectorized_scalar_stmts;
    9674       651543 :   hash_set<stmt_vec_info> scalar_stmts_in_externs;
    9675       651543 :   hash_set<slp_tree> visited;
    9676      1322093 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9677              :     {
    9678       670550 :       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
    9679              :                                                SLP_INSTANCE_TREE (instance),
    9680              :                                                visited,
    9681              :                                                vectorized_scalar_stmts,
    9682              :                                                scalar_stmts_in_externs);
    9683       777988 :       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
    9684        51350 :         vectorized_scalar_stmts.add (rstmt);
    9685              :     }
    9686              :   /* Scalar stmts used as defs in external nodes need to be preseved, so
    9687              :      remove them from vectorized_scalar_stmts.  */
    9688       946169 :   for (stmt_vec_info stmt : scalar_stmts_in_externs)
    9689       294626 :     vectorized_scalar_stmts.remove (stmt);
    9690              : 
    9691              :   /* Calculate scalar cost and sum the cost for the vector stmts
    9692              :      previously collected.  */
    9693       651543 :   stmt_vector_for_cost scalar_costs = vNULL;
    9694       651543 :   stmt_vector_for_cost vector_costs = vNULL;
    9695       651543 :   visited.empty ();
    9696      1322093 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9697              :     {
    9698       670550 :       auto_vec<bool, 20> life;
    9699       670550 :       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
    9700              :                               true);
    9701       670550 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9702        56088 :         record_stmt_cost (&scalar_costs,
    9703        28044 :                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
    9704              :                           scalar_stmt,
    9705        28044 :                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
    9706       670550 :       vect_bb_slp_scalar_cost (bb_vinfo,
    9707              :                                SLP_INSTANCE_TREE (instance),
    9708              :                                &life, &scalar_costs, vectorized_scalar_stmts,
    9709              :                                scalar_stmts_in_externs, visited);
    9710       670550 :       vector_costs.safe_splice (instance->cost_vec);
    9711       670550 :       instance->cost_vec.release ();
    9712       670550 :     }
    9713              : 
    9714       651543 :   if (dump_enabled_p ())
    9715           98 :     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
    9716              : 
    9717              :   /* When costing non-loop vectorization we need to consider each covered
    9718              :      loop independently and make sure vectorization is profitable.  For
    9719              :      now we assume a loop may be not entered or executed an arbitrary
    9720              :      number of iterations (???  static information can provide more
    9721              :      precise info here) which means we can simply cost each containing
    9722              :      loop's stmts separately.  */
    9723              : 
    9724              :   /* First produce cost vectors sorted by loop index.  */
    9725       651543 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9726       651543 :     li_scalar_costs (scalar_costs.length ());
    9727       651543 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9728       651543 :     li_vector_costs (vector_costs.length ());
    9729       651543 :   stmt_info_for_cost *cost;
    9730      2780472 :   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9731              :     {
    9732      2128929 :       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9733      2128929 :       li_scalar_costs.quick_push (std::make_pair (l, cost));
    9734              :     }
    9735              :   /* Use a random used loop as fallback in case the first vector_costs
    9736              :      entry does not have a stmt_info associated with it.  */
    9737       651543 :   unsigned l = li_scalar_costs[0].first;
    9738      2402080 :   FOR_EACH_VEC_ELT (vector_costs, i, cost)
    9739              :     {
    9740              :       /* We inherit from the previous COST, invariants, externals and
    9741              :          extracts immediately follow the cost for the related stmt.  */
    9742      1750537 :       if (cost->stmt_info)
    9743      1036525 :         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9744      1750537 :       li_vector_costs.quick_push (std::make_pair (l, cost));
    9745              :     }
    9746       651543 :   li_scalar_costs.qsort (li_cost_vec_cmp);
    9747       651543 :   li_vector_costs.qsort (li_cost_vec_cmp);
    9748              : 
    9749              :   /* Now cost the portions individually.  */
    9750              :   unsigned vi = 0;
    9751              :   unsigned si = 0;
    9752      1131665 :   bool profitable = true;
    9753      1131665 :   while (si < li_scalar_costs.length ()
    9754      1788006 :          && vi < li_vector_costs.length ())
    9755              :     {
    9756       656341 :       unsigned sl = li_scalar_costs[si].first;
    9757       656341 :       unsigned vl = li_vector_costs[vi].first;
    9758       656341 :       if (sl != vl)
    9759              :         {
    9760         1219 :           if (dump_enabled_p ())
    9761            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    9762              :                              "Scalar %d and vector %d loop part do not "
    9763              :                              "match up, skipping scalar part\n", sl, vl);
    9764              :           /* Skip the scalar part, assuming zero cost on the vector side.  */
    9765         2640 :           do
    9766              :             {
    9767         2640 :               si++;
    9768              :             }
    9769         2640 :           while (si < li_scalar_costs.length ()
    9770         4730 :                  && li_scalar_costs[si].first == sl);
    9771         1219 :           continue;
    9772              :         }
    9773              : 
    9774       655122 :       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
    9775      2109151 :       do
    9776              :         {
    9777      2109151 :           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
    9778      2109151 :           si++;
    9779              :         }
    9780      2109151 :       while (si < li_scalar_costs.length ()
    9781      4225835 :              && li_scalar_costs[si].first == sl);
    9782       655122 :       scalar_target_cost_data->finish_cost (nullptr);
    9783       655122 :       scalar_cost = (scalar_target_cost_data->body_cost ()
    9784       655122 :                      * param_vect_scalar_cost_multiplier) / 100;
    9785              : 
    9786              :       /* Complete the target-specific vector cost calculation.  */
    9787       655122 :       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
    9788      1716563 :       do
    9789              :         {
    9790      1716563 :           add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
    9791      1716563 :           vi++;
    9792              :         }
    9793      1716563 :       while (vi < li_vector_costs.length ()
    9794      3441792 :              && li_vector_costs[vi].first == vl);
    9795       655122 :       vect_target_cost_data->finish_cost (scalar_target_cost_data);
    9796       655122 :       vec_prologue_cost = vect_target_cost_data->prologue_cost ();
    9797       655122 :       vec_inside_cost = vect_target_cost_data->body_cost ();
    9798       655122 :       vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
    9799       655122 :       delete scalar_target_cost_data;
    9800       655122 :       delete vect_target_cost_data;
    9801              : 
    9802       655122 :       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
    9803              : 
    9804       655122 :       if (dump_enabled_p ())
    9805              :         {
    9806           98 :           dump_printf_loc (MSG_NOTE, vect_location,
    9807              :                            "Cost model analysis for part in loop %d:\n", sl);
    9808           98 :           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
    9809              :                        vec_inside_cost + vec_outside_cost);
    9810           98 :           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
    9811              :         }
    9812              : 
    9813              :       /* Vectorization is profitable if its cost is not more than the cost of the
    9814              :          scalar version.  Note that we err on the vector side for equal cost because
    9815              :          the cost estimate is otherwise quite pessimistic (constant uses are
    9816              :          free on the scalar side but cost a load on the vector side for
    9817              :          example).  */
    9818       655122 :       if (vec_outside_cost + vec_inside_cost > scalar_cost)
    9819              :         {
    9820              :           profitable = false;
    9821              :           break;
    9822              :         }
    9823              :     }
    9824      1126854 :   if (profitable && vi < li_vector_costs.length ())
    9825              :     {
    9826         1151 :       if (dump_enabled_p ())
    9827           12 :         dump_printf_loc (MSG_NOTE, vect_location,
    9828              :                          "Excess vector cost for part in loop %d:\n",
    9829            6 :                          li_vector_costs[vi].first);
    9830              :       profitable = false;
    9831              :     }
    9832              : 
    9833              :   /* Unset visited flag.  This is delayed when the subgraph is profitable
    9834              :      and we process the loop for remaining unvectorized if-converted code.  */
    9835       651543 :   if (!orig_loop || !profitable)
    9836      2779185 :     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9837      2127731 :       gimple_set_visited  (cost->stmt_info->stmt, false);
    9838              : 
    9839       651543 :   scalar_costs.release ();
    9840       651543 :   vector_costs.release ();
    9841              : 
    9842       651543 :   return profitable;
    9843       651543 : }
    9844              : 
    9845              : /* qsort comparator for lane defs: orders (lane, def) pairs by ascending lane.  */
    9846              : 
    9847              : static int
    9848           40 : vld_cmp (const void *a_, const void *b_)
    9849              : {
    9850           40 :   auto *a = (const std::pair<unsigned, tree> *)a_;
    9851           40 :   auto *b = (const std::pair<unsigned, tree> *)b_;
    9852           40 :   return a->first - b->first;  /* Unsigned lane difference, converted to int.  */
    9853              : }
    9854              : 
    9855              : /* Return true if USE_STMT is a vector lane insert (BIT_INSERT_EXPR) into VEC and set
    9856              :    *THIS_LANE to the lane number that is set.  If VEC is NULL_TREE the vector inserted into is used.  */
    9857              : 
    9858              : static bool
    9859          248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
    9860              : {
    9861          248 :   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
    9862           91 :   if (!use_ass  /* Not an assignment.  */
    9863           91 :       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
    9864           22 :       || (vec
    9865           22 :           ? gimple_assign_rhs1 (use_ass) != vec  /* Inserts into a different vector.  */
    9866           24 :           : ((vec = gimple_assign_rhs1 (use_ass)), false))  /* No VEC given: take the operand.  */
    9867           46 :       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),  /* Inserted value must match the element type.  */
    9868           46 :                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
    9869           46 :       || !constant_multiple_p  /* Bit position must be a constant multiple of the element size.  */
    9870           46 :             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
    9871           92 :              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
    9872              :              this_lane))
    9873          202 :     return false;
    9874              :   return true;
    9875              : }
    9876              : 
    9877              : /* Find any vectorizable constructors, BIT_INSERT_EXPR lane-insert chains and
    9878              :    reduction chains in the region and record them in BB_VINFO->roots.  */
    9879              : 
    9880              : static void
    9881      2197543 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
    9882              : {
    9883     17788150 :   for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
    9884     31181214 :     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
    9885    135793589 :          !gsi_end_p (gsi); gsi_next (&gsi))
    9886              :     {
    9887    120202982 :       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
    9888              :       /* This can be used to start SLP discovery for BB early breaks
    9889              :          when we get that far.  */
    9890    120202982 :       if (!assign)
    9891    180435275 :         continue;
    9892              : 
    9893     30899947 :       tree rhs = gimple_assign_rhs1 (assign);
    9894     30899947 :       enum tree_code code = gimple_assign_rhs_code (assign);
    9895     30899947 :       use_operand_p use_p;
    9896     30899947 :       gimple *use_stmt;
    9897     30899947 :       if (code == CONSTRUCTOR)
    9898              :         {
    9899      1564360 :           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9900        62281 :               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
    9901        91159 :                            CONSTRUCTOR_NELTS (rhs))
    9902        42169 :               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
    9903      1606529 :               || uniform_vector_p (rhs))
    9904      1551909 :             continue;
    9905              : 
    9906              :           unsigned j;
    9907              :           tree val;
    9908        61225 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9909        48774 :             if (TREE_CODE (val) != SSA_NAME
    9910        48774 :                 || !bb_vinfo->lookup_def (val))
    9911              :               break;
    9912        30674 :           if (j != CONSTRUCTOR_NELTS (rhs))
    9913         2886 :             continue;
    9914              : 
    9915        12451 :           vec<stmt_vec_info> roots = vNULL;
    9916        12451 :           roots.safe_push (bb_vinfo->lookup_stmt (assign));
    9917        12451 :           vec<stmt_vec_info> stmts;
    9918        12451 :           stmts.create (CONSTRUCTOR_NELTS (rhs));
    9919        69216 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9920        44314 :             stmts.quick_push
    9921        44314 :               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
    9922        12451 :           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9923        12451 :                                                stmts, roots));
    9924              :         }
    9925     29335587 :       else if (code == BIT_INSERT_EXPR
    9926          924 :                && VECTOR_TYPE_P (TREE_TYPE (rhs))
    9927          606 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
    9928          606 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
    9929          603 :                && integer_zerop (gimple_assign_rhs3 (assign))
    9930          341 :                && useless_type_conversion_p
    9931          341 :                     (TREE_TYPE (TREE_TYPE (rhs)),
    9932          341 :                      TREE_TYPE (gimple_assign_rhs2 (assign)))
    9933     29336209 :                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
    9934              :         {
    9935              :           /* We start to match on insert to lane zero but since the
    9936              :              inserts need not be ordered we'd have to search both
    9937              :              the def and the use chains.  */
    9938          215 :           tree vectype = TREE_TYPE (rhs);
    9939          215 :           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
    9940          215 :           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
    9941          215 :           auto_sbitmap lanes (nlanes);
    9942          215 :           bitmap_clear (lanes);
    9943          215 :           bitmap_set_bit (lanes, 0);
    9944          215 :           tree def = gimple_assign_lhs (assign);
    9945          215 :           lane_defs.quick_push
    9946          215 :                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
    9947          215 :           unsigned lanes_found = 1;
    9948              :           /* Start with the use chains, the last stmt will be the root.  */
    9949          215 :           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
    9950          215 :           vec<stmt_vec_info> roots = vNULL;
    9951          215 :           roots.safe_push (last);
    9952          217 :           do
    9953              :             {
    9954          217 :               use_operand_p use_p;
    9955          217 :               gimple *use_stmt;
    9956          217 :               if (!single_imm_use (def, &use_p, &use_stmt))
    9957              :                 break;
    9958          211 :               unsigned this_lane;
    9959          211 :               if (!bb_vinfo->lookup_stmt (use_stmt)
    9960          211 :                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
    9961          233 :                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
    9962              :                 break;
    9963           22 :               if (bitmap_bit_p (lanes, this_lane))
    9964              :                 break;
    9965            2 :               lanes_found++;
    9966            2 :               bitmap_set_bit (lanes, this_lane);
    9967            2 :               gassign *use_ass = as_a <gassign *> (use_stmt);
    9968            2 :               lane_defs.quick_push (std::make_pair
    9969            2 :                                      (this_lane, gimple_assign_rhs2 (use_ass)));
    9970            2 :               last = bb_vinfo->lookup_stmt (use_ass);
    9971            2 :               roots.safe_push (last);
    9972            2 :               def = gimple_assign_lhs (use_ass);
    9973              :             }
    9974            2 :           while (lanes_found < nlanes);
    9975          215 :           if (roots.length () > 1)
    9976            2 :             std::swap(roots[0], roots[roots.length () - 1]);  /* Put the last use-chain stmt first.  */
    9977          215 :           if (lanes_found < nlanes)
    9978              :             {
    9979              :               /* Now search the def chain.  */
    9980          215 :               def = gimple_assign_rhs1 (assign);
    9981          217 :               do
    9982              :                 {
    9983          217 :                   if (TREE_CODE (def) != SSA_NAME
    9984          217 :                       || !has_single_use (def))
    9985              :                     break;
    9986           56 :                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
    9987           56 :                   unsigned this_lane;
    9988           56 :                   if (!bb_vinfo->lookup_stmt (def_stmt)
    9989           37 :                       || !vect_slp_is_lane_insert (def_stmt,
    9990              :                                                    NULL_TREE, &this_lane)
    9991           80 :                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
    9992              :                     break;
    9993           24 :                   if (bitmap_bit_p (lanes, this_lane))
    9994              :                     break;
    9995            4 :                   lanes_found++;
    9996            4 :                   bitmap_set_bit (lanes, this_lane);
    9997            8 :                   lane_defs.quick_push (std::make_pair
    9998            4 :                                           (this_lane,
    9999            4 :                                            gimple_assign_rhs2 (def_stmt)));
   10000            4 :                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
   10001            4 :                   def = gimple_assign_rhs1 (def_stmt);
   10002              :                 }
   10003            4 :               while (lanes_found < nlanes);
   10004              :             }
   10005          215 :           if (lanes_found == nlanes)
   10006              :             {
   10007              :               /* Sort lane_defs by lane index and register the root.  */
   10008            2 :               lane_defs.qsort (vld_cmp);
   10009            2 :               vec<stmt_vec_info> stmts;
   10010            2 :               stmts.create (nlanes);
   10011           10 :               for (unsigned i = 0; i < nlanes; ++i)
   10012            8 :                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
   10013            2 :               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
   10014            2 :                                                    stmts, roots));
   10015              :             }
   10016              :           else
   10017          213 :             roots.release ();
   10018          215 :         }
   10019     29335372 :       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
   10020     28371967 :                && (associative_tree_code (code) || code == MINUS_EXPR)
   10021              :                /* ???  This pessimizes a two-element reduction.  PR54400.
   10022              :                   ???  In-order reduction could be handled if we only
   10023              :                   traverse one operand chain in vect_slp_linearize_chain.  */
   10024     33276033 :                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
   10025              :                /* Ops with constants at the tail can be stripped here.  */
   10026      5803578 :                && TREE_CODE (rhs) == SSA_NAME
   10027      5744083 :                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
   10028              :                /* Should be the chain end.  */
   10029     31605333 :                && (!single_imm_use (gimple_assign_lhs (assign),
   10030              :                                     &use_p, &use_stmt)
   10031      1750982 :                    || !is_gimple_assign (use_stmt)
   10032      1191228 :                    || (gimple_assign_rhs_code (use_stmt) != code
   10033       882799 :                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
   10034       500043 :                            || (gimple_assign_rhs_code (use_stmt)
   10035       500043 :                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
   10036              :         {
   10037              :           /* We start the match at the end of a possible association
   10038              :              chain.  */
   10039      1862917 :           auto_vec<chain_op_t> chain;
   10040      1862917 :           auto_vec<std::pair<tree_code, gimple *> > worklist;
   10041      1862917 :           auto_vec<gimple *> chain_stmts;
   10042      1862917 :           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
   10043      1862917 :           if (code == MINUS_EXPR)
   10044       306702 :             code = PLUS_EXPR;  /* Handle a chain ending in MINUS_EXPR as a PLUS_EXPR chain.  */
   10045      1862917 :           internal_fn reduc_fn;
   10046      2140213 :           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
   10047      1862917 :               || reduc_fn == IFN_LAST)
   10048       277296 :             continue;
   10049      1585621 :           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
   10050              :                                     /* ??? */
   10051              :                                     code_stmt, alt_code_stmt, &chain_stmts);
   10052      3171242 :           if (chain.length () > 1)
   10053              :             {
   10054              :               /* Sort the chain according to def_type and operation.  */
   10055      1585621 :               chain.sort (dt_sort_cmp, bb_vinfo);
   10056              :               /* ???  Now we'd want to strip externals and constants
   10057              :                  but record those to be handled in the epilogue.  */
   10058              :               /* ???  For now do not allow mixing ops or externs/constants.  */
   10059      1585621 :               bool invalid = false;
   10060      1585621 :               unsigned remain_cnt = 0;
   10061      1585621 :               unsigned last_idx = 0;
   10062      4781609 :               for (unsigned i = 0; i < chain.length (); ++i)
   10063              :                 {
   10064      3525796 :                   if (chain[i].code != code)
   10065              :                     {
   10066              :                       invalid = true;
   10067              :                       break;
   10068              :                     }
   10069      3195988 :                   if (chain[i].dt != vect_internal_def
   10070              :                       /* Avoid stmts where the def is not the LHS, like
   10071              :                          ASMs.  */
   10072      6161971 :                       || (gimple_get_lhs (bb_vinfo->lookup_def
   10073      2965983 :                                                       (chain[i].op)->stmt)
   10074      2965983 :                           != chain[i].op))
   10075       232949 :                     remain_cnt++;
   10076              :                   else
   10077              :                     last_idx = i;
   10078              :                 }
   10079              :               /* Make sure to have an even number of lanes as we later do
   10080              :                  all-or-nothing discovery, not trying to split further.  */
   10081      1585621 :               if ((chain.length () - remain_cnt) & 1)
   10082       185443 :                 remain_cnt++;
   10083      1585621 :               if (!invalid && chain.length () - remain_cnt > 1)
   10084              :                 {
   10085      1187636 :                   vec<stmt_vec_info> stmts;
   10086      1187636 :                   vec<tree> remain = vNULL;
   10087      1187636 :                   stmts.create (chain.length ());
   10088      1187636 :                   if (remain_cnt > 0)
   10089       110141 :                     remain.create (remain_cnt);
   10090      3816764 :                   for (unsigned i = 0; i < chain.length (); ++i)
   10091              :                     {
   10092      2629128 :                       stmt_vec_info stmt_info;
   10093      2629128 :                       if (chain[i].dt == vect_internal_def
   10094      2592312 :                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
   10095      2592312 :                               gimple_get_lhs (stmt_info->stmt) == chain[i].op)
   10096      5221356 :                           && (i != last_idx
   10097      1187636 :                               || (stmts.length () & 1)))
   10098      2507650 :                         stmts.quick_push (stmt_info);
   10099              :                       else
   10100       121478 :                         remain.quick_push (chain[i].op);
   10101              :                     }
   10102      1187636 :                   vec<stmt_vec_info> roots;
   10103      1187636 :                   roots.create (chain_stmts.length ());
   10104      2629128 :                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
   10105      1441492 :                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
   10106      1187636 :                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
   10107      1187636 :                                                        stmts, roots, remain));
   10108              :                 }
   10109              :             }
   10110      1862917 :         }
   10111              :     }
   10112      2197543 : }
   10113              : 
   10114              : /* Walk the grouped store chains in VINFO->grouped_stores and replace entries
   10115              :    with their pattern variant if any, moving the DR group links to the pattern stmts.  */
   10116              : 
   10117              : static void
   10118       609003 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
   10119              : {
   10120       609003 :   stmt_vec_info first_element;
   10121       609003 :   unsigned i;
   10122              : 
   10123      1491620 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
   10124              :     {
   10125              :       /* We also have CTORs in this array.  */
   10126       882617 :       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
   10127            0 :         continue;
   10128       882617 :       if (STMT_VINFO_IN_PATTERN_P (first_element))  /* Group leader was replaced by a pattern.  */
   10129              :         {
   10130          254 :           stmt_vec_info orig = first_element;
   10131          254 :           first_element = STMT_VINFO_RELATED_STMT (first_element);
   10132          254 :           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
   10133          254 :           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
   10134          254 :           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
   10135          254 :           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
   10136          254 :           vinfo->grouped_stores[i] = first_element;
   10137              :         }
   10138       882617 :       stmt_vec_info prev = first_element;
   10139      2478507 :       while (DR_GROUP_NEXT_ELEMENT (prev))
   10140              :         {
   10141      1595890 :           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
   10142      1595890 :           if (STMT_VINFO_IN_PATTERN_P (elt))  /* Splice the pattern stmt into the chain instead.  */
   10143              :             {
   10144          893 :               stmt_vec_info orig = elt;
   10145          893 :               elt = STMT_VINFO_RELATED_STMT (elt);
   10146          893 :               DR_GROUP_NEXT_ELEMENT (prev) = elt;
   10147          893 :               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
   10148          893 :               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
   10149              :             }
   10150      1595890 :           DR_GROUP_FIRST_ELEMENT (elt) = first_element;  /* Point at the possibly replaced leader.  */
   10151      1595890 :           prev = elt;
   10152              :         }
   10153              :     }
   10154       609003 : }
   10155              : 
   10156              : /* Check if the region described by BB_VINFO can be vectorized, returning
   10157              :    true if so.  When returning false, set FATAL to true if the same failure
   10158              :    would prevent vectorization at other vector sizes, false if it is still
   10159              :    worth trying other sizes.  N_STMTS is the number of statements in the
   10160              :    region.  */
   10161              : 
   10162              : static bool
   10163      2197543 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
   10164              :                        vec<int> *dataref_groups)
   10165              : {
   10166      2197543 :   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
   10167              : 
   10168      2197543 :   slp_instance instance;
   10169      2197543 :   int i;
   10170              : 
   10171              :   /* The first group of checks is independent of the vector size, so a
   10172      2197543 :   fatal = true;
   10173              : 
   10174              :   /* Analyze the data references.  */
   10175              : 
   10176      2197543 :   if (!vect_analyze_data_refs (bb_vinfo, NULL))
   10177              :     {
   10178            0 :       if (dump_enabled_p ())
   10179            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10180              :                          "not vectorized: unhandled data-ref in basic "
   10181              :                          "block.\n");
   10182            0 :       return false;
   10183              :     }
   10184              : 
   10185      2197543 :   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
   10186              :     {
   10187            0 :      if (dump_enabled_p ())
   10188            0 :        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10189              :                         "not vectorized: unhandled data access in "
   10190              :                         "basic block.\n");
   10191            0 :       return false;
   10192              :     }
   10193              : 
   10194      2197543 :   vect_slp_check_for_roots (bb_vinfo);
   10195              : 
   10196              :   /* If there are no grouped stores and no constructors in the region
   10197              :      there is no need to continue with pattern recog as vect_analyze_slp
   10198              :      will fail anyway.  */
   10199      2197543 :   if (bb_vinfo->grouped_stores.is_empty ()
   10200      1856767 :       && bb_vinfo->roots.is_empty ())
   10201              :     {
   10202      1588540 :       if (dump_enabled_p ())
   10203         1022 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10204              :                          "not vectorized: no grouped stores in "
   10205              :                          "basic block.\n");
   10206      1588540 :       return false;
   10207              :     }
   10208              : 
   10209              :   /* The rest of the analysis depends on the vector size in some way, so failing below is not fatal.  */
   10210       609003 :   fatal = false;
   10211              : 
   10212       609003 :   vect_pattern_recog (bb_vinfo);
   10213              : 
   10214              :   /* Update store groups from pattern processing.  */
   10215       609003 :   vect_fixup_store_groups_with_patterns (bb_vinfo);
   10216              : 
   10217              :   /* Check the SLP opportunities in the basic block, analyze and build SLP
   10218              :      trees.  */
   10219       609003 :   if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
   10220              :     {
   10221            0 :       if (dump_enabled_p ())
   10222              :         {
   10223            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10224              :                            "Failed to SLP the basic block.\n");
   10225            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10226              :                            "not vectorized: failed to find SLP opportunities "
   10227              :                            "in basic block.\n");
   10228              :         }
   10229            0 :       return false;
   10230              :     }
   10231              : 
   10232              :   /* Optimize permutations.  */
   10233       609003 :   vect_optimize_slp (bb_vinfo);
   10234              : 
   10235              :   /* Gather the loads reachable from the SLP graph entries.  */
   10236       609003 :   vect_gather_slp_loads (bb_vinfo);
   10237              : 
   10238       609003 :   vect_record_base_alignments (bb_vinfo);
   10239              : 
   10240              :   /* Analyze and verify the alignment of data references and the
   10241              :      dependence in the SLP instances, removing the instances that fail.  */
   10242      1390636 :   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
   10243              :     {
   10244       781633 :       vect_location = instance->location ();
   10245       781633 :       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
   10246       781633 :           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
   10247              :         {
   10248         8405 :           slp_tree node = SLP_INSTANCE_TREE (instance);
   10249         8405 :           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   10250         8405 :           if (dump_enabled_p ())
   10251            4 :             dump_printf_loc (MSG_NOTE, vect_location,
   10252              :                              "removing SLP instance operations starting from: %G",
   10253              :                              stmt_info->stmt);
   10254         8405 :           vect_free_slp_instance (instance);
   10255         8405 :           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
   10256         8405 :           continue;
   10257         8405 :         }
   10258              : 
   10259              :       /* Mark all the statements that we want to vectorize as pure SLP and
   10260              :          relevant.  */
   10261       773228 :       vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
   10262       773228 :       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
   10263       773228 :       unsigned j;
   10264       773228 :       stmt_vec_info root;
   10265              :       /* Likewise consider instance root stmts as vectorized.  */
   10266      1707544 :       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
   10267       161088 :         STMT_SLP_TYPE (root) = pure_slp;
   10268              :       /* Only advance past instances that were kept.  */
   10269       773228 :       i++;
   10270              :     }
   10271      2227332 :   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
   10272              :     return false;  /* No SLP instance is left to vectorize.  */
   10273              : 
   10274       263665 :   if (!vect_slp_analyze_operations (bb_vinfo))
   10275              :     {
   10276        29789 :       if (dump_enabled_p ())
   10277           81 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10278              :                          "not vectorized: bad operation in basic block.\n");
   10279        29789 :       return false;
   10280              :     }
   10281              : 
   10282       233876 :   vect_bb_partition_graph (bb_vinfo);
   10283              : 
   10284       233876 :   return true;
   10285              : }
   10286              : 
   10287              : /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
   10288              :    basic blocks in BBS, returning true if anything was vectorized.  The
   10289              :    region has N_STMTS statements and the datarefs given by DATAREFS.  ORIG_LOOP is nonnull for an if-converted loop body.  */
   10290              : 
   10291              : static bool
   10292      1877972 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
   10293              :                  vec<int> *dataref_groups, unsigned int n_stmts,
   10294              :                  loop_p orig_loop)
   10295              : {
   10296      1877972 :   bb_vec_info bb_vinfo;
   10297      1877972 :   auto_vector_modes vector_modes;
   10298              : 
   10299              :   /* Autodetect first vector size we try.  */
   10300      1877972 :   machine_mode next_vector_mode = VOIDmode;
   10301      1877972 :   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
   10302      1877972 :   unsigned int mode_i = 0;
   10303              : 
   10304      1877972 :   vec_info_shared shared;
   10305              : 
   10306      1877972 :   machine_mode autodetected_vector_mode = VOIDmode;
   10307      2517114 :   while (1)
   10308              :     {
   10309      2197543 :       bool vectorized = false;
   10310      2197543 :       bool fatal = false;
   10311      2197543 :       bb_vinfo = new _bb_vec_info (bbs, &shared);
   10312              : 
   10313      2197543 :       bool first_time_p = shared.datarefs.is_empty ();
   10314      2197543 :       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
   10315      2197543 :       if (first_time_p)
   10316      1900248 :         bb_vinfo->shared->save_datarefs ();
   10317              :       else
   10318       297295 :         bb_vinfo->shared->check_datarefs ();
   10319      2197543 :       bb_vinfo->vector_mode = next_vector_mode;
   10320              : 
   10321      2197543 :       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
   10322              :         {
   10323       233876 :           if (dump_enabled_p ())
   10324              :             {
   10325         1502 :               dump_printf_loc (MSG_NOTE, vect_location,
   10326              :                                "***** Analysis succeeded with vector mode"
   10327          751 :                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
   10328          751 :               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
   10329              :             }
   10330              : 
   10331       233876 :           bb_vinfo->shared->check_datarefs ();
   10332              :           /* Collect the instances whose subgraphs are to be scheduled.  */
   10333       233876 :           bool force_clear = false;
   10334       233876 :           auto_vec<slp_instance> profitable_subgraphs;
   10335      1375647 :           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
   10336              :             {
   10337       674019 :               if (instance->subgraph_entries.is_empty ())
   10338       215658 :                 continue;
   10339              : 
   10340       654875 :               dump_user_location_t saved_vect_location = vect_location;
   10341       654875 :               vect_location = instance->location ();
   10342       654875 :               if (!unlimited_cost_model (NULL)
   10343      1306418 :                   && !vect_bb_vectorization_profitable_p
   10344       651543 :                         (bb_vinfo, instance->subgraph_entries, orig_loop))
   10345              :                 {
   10346       177370 :                   if (dump_enabled_p ())
   10347           28 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10348              :                                      "not vectorized: vectorization is not "
   10349              :                                      "profitable.\n");
   10350       177370 :                   vect_location = saved_vect_location;
   10351       177370 :                   continue;
   10352              :                 }
   10353              : 
   10354       477505 :               vect_location = saved_vect_location;
   10355       477505 :               if (!dbg_cnt (vect_slp))
   10356              :                 {
   10357            0 :                   force_clear = true;
   10358            0 :                   continue;
   10359              :                 }
   10360              : 
   10361       477505 :               profitable_subgraphs.safe_push (instance);
   10362              :             }
   10363              : 
   10364              :           /* When we're vectorizing an if-converted loop body make sure
   10365              :              we vectorized all if-converted code.  */
   10366       391766 :           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
   10367              :             {
   10368           97 :               gcc_assert (bb_vinfo->nbbs == 1);
   10369          194 :               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
   10370         4084 :                    !gsi_end_p (gsi); gsi_next (&gsi))
   10371              :                 {
   10372              :                   /* The costing above left us with DCEable vectorized scalar
   10373              :                      stmts having the visited flag set on profitable
   10374              :                      subgraphs.  Do the delayed clearing of the flag here.  */
   10375         3987 :                   if (gimple_visited_p (gsi_stmt (gsi)))
   10376              :                     {
   10377         1172 :                       gimple_set_visited (gsi_stmt (gsi), false);
   10378         1172 :                       continue;
   10379              :                     }
   10380         2815 :                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
   10381          813 :                     continue;
   10382              : 
   10383         5859 :                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
   10384         2450 :                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
   10385              :                       {
   10386           51 :                         if (!profitable_subgraphs.is_empty ()
   10387           22 :                             && dump_enabled_p ())
   10388            0 :                           dump_printf_loc (MSG_NOTE, vect_location,
   10389              :                                            "not profitable because of "
   10390              :                                            "unprofitable if-converted scalar "
   10391              :                                            "code\n");
   10392           29 :                         profitable_subgraphs.truncate (0);
   10393              :                       }
   10394              :                 }
   10395              :             }
   10396              : 
   10397              :           /* Finally schedule the profitable subgraphs.  */
   10398      1027129 :           for (slp_instance instance : profitable_subgraphs)
   10399              :             {
   10400       477473 :               if (!vectorized && dump_enabled_p ())
   10401          726 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10402              :                                  "Basic block will be vectorized "
   10403              :                                  "using SLP\n");
   10404       477473 :               vectorized = true;
   10405              : 
   10406              :               /* Dump before scheduling as store vectorization will remove
   10407              :                  the original stores and mess with the instance tree
   10408              :                  so querying its location will eventually ICE.  */
   10409       477473 :               if (flag_checking)
   10410      1920263 :                 for (slp_instance sub : instance->subgraph_entries)
   10411       487844 :                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
   10412       477473 :               unsigned HOST_WIDE_INT bytes;
   10413       477473 :               if (dump_enabled_p ())
   10414         3457 :                 for (slp_instance sub : instance->subgraph_entries)
   10415              :                   {
   10416          916 :                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
   10417         1832 :                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
   10418          916 :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10419          916 :                                        sub->location (),
   10420              :                                        "basic block part vectorized using %wu "
   10421              :                                        "byte vectors\n", bytes);
   10422              :                     else
   10423              :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10424              :                                        sub->location (),
   10425              :                                        "basic block part vectorized using "
   10426              :                                        "variable length vectors\n");
   10427              :                   }
   10428              : 
   10429       477473 :               dump_user_location_t saved_vect_location = vect_location;
   10430       477473 :               vect_location = instance->location ();
   10431              : 
   10432       477473 :               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
   10433              : 
   10434       477473 :               vect_location = saved_vect_location;
   10435              :             }
   10436              : 
   10437              : 
   10438              :           /* Generate the invariant statements.  */
   10439       233876 :           if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
   10440              :             {
   10441           23 :               if (dump_enabled_p ())
   10442            0 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10443              :                          "------>generating invariant statements\n");
   10444              : 
   10445           23 :               bb_vinfo->insert_seq_on_entry (NULL,
   10446              :                                              bb_vinfo->inv_pattern_def_seq);
   10447              :             }
   10448       233876 :         }
   10449              :       else
   10450              :         {
   10451      1963667 :           if (dump_enabled_p ())
   10452         1314 :             dump_printf_loc (MSG_NOTE, vect_location,
   10453              :                              "***** Analysis failed with vector mode %s\n",
   10454         1314 :                              GET_MODE_NAME (bb_vinfo->vector_mode));
   10455              :         }
   10456              :       /* MODE_I is still zero only on the first, autodetecting iteration.  */
   10457      2197543 :       if (mode_i == 0)
   10458      1877972 :         autodetected_vector_mode = bb_vinfo->vector_mode;
   10459              :       /* Skip the modes for which the analysis result would be the same.  */
   10460      2197543 :       if (!fatal)
   10461      3139013 :         while (mode_i < vector_modes.length ()
   10462      1751329 :                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
   10463              :           {
   10464       332467 :             if (dump_enabled_p ())
   10465         1654 :               dump_printf_loc (MSG_NOTE, vect_location,
   10466              :                                "***** The result for vector mode %s would"
   10467              :                                " be the same\n",
   10468          827 :                                GET_MODE_NAME (vector_modes[mode_i]));
   10469       332467 :             mode_i += 1;
   10470              :           }
   10471              : 
   10472      2197543 :       delete bb_vinfo;
   10473              :       /* Likewise skip a mode that would repeat the autodetected analysis.  */
   10474      2197543 :       if (mode_i < vector_modes.length ()
   10475      2020865 :           && VECTOR_MODE_P (autodetected_vector_mode)
   10476      1997372 :           && (related_vector_mode (vector_modes[mode_i],
   10477              :                                    GET_MODE_INNER (autodetected_vector_mode))
   10478       998686 :               == autodetected_vector_mode)
   10479      4218408 :           && (related_vector_mode (autodetected_vector_mode,
   10480       520833 :                                    GET_MODE_INNER (vector_modes[mode_i]))
   10481      1041666 :               == vector_modes[mode_i]))
   10482              :         {
   10483       520833 :           if (dump_enabled_p ())
   10484          205 :             dump_printf_loc (MSG_NOTE, vect_location,
   10485              :                              "***** Skipping vector mode %s, which would"
   10486              :                              " repeat the analysis for %s\n",
   10487          205 :                              GET_MODE_NAME (vector_modes[mode_i]),
   10488          205 :                              GET_MODE_NAME (autodetected_vector_mode));
   10489       520833 :           mode_i += 1;
   10490              :         }
   10491              : 
   10492      2197543 :       if (vectorized
   10493      2039675 :           || mode_i == vector_modes.length ()
   10494      1863044 :           || autodetected_vector_mode == VOIDmode
   10495              :           /* If vect_slp_analyze_bb_1 signaled that analysis for all
   10496              :              vector sizes will fail do not bother iterating.  */
   10497      3038408 :           || fatal)
   10498      3755944 :         return vectorized;
   10499              : 
   10500              :       /* Try the next candidate vector mode.  */
   10501       319571 :       next_vector_mode = vector_modes[mode_i++];
   10502       319571 :       if (dump_enabled_p ())
   10503          218 :         dump_printf_loc (MSG_NOTE, vect_location,
   10504              :                          "***** Re-trying analysis with vector mode %s\n",
   10505          218 :                          GET_MODE_NAME (next_vector_mode));
   10506       319571 :     }
   10507      1877972 : }
   10508              : 
   10509              : 
   10510              : /* Worker for the BB vectorizer entry points.  Analyze and transform the region
   10511              :    BBS, handing ORIG_LOOP on to vect_slp_region; returns true if anything was vectorized.  */
   10512              : 
   10513              : static bool
   10514      1877972 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
   10515              : {
   10516      1877972 :   vec<data_reference_p> datarefs = vNULL;
   10517      1877972 :   auto_vec<int> dataref_groups;
   10518      1877972 :   int insns = 0;
   10519      1877972 :   int current_group = 0;
   10520              :   /* Count the non-debug stmts and collect their data references.  */
   10521     12545866 :   for (unsigned i = 0; i < bbs.length (); i++)
   10522              :     {
   10523     10667894 :       basic_block bb = bbs[i];
   10524     88915180 :       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
   10525     78247286 :            gsi_next (&gsi))
   10526              :         {
   10527     78247286 :           gimple *stmt = gsi_stmt (gsi);
   10528     78247286 :           if (is_gimple_debug (stmt))
   10529     48554624 :             continue;
   10530              : 
   10531     29692662 :           insns++;
   10532              : 
   10533     29692662 :           if (gimple_location (stmt) != UNKNOWN_LOCATION)
   10534     26690447 :             vect_location = stmt;
   10535              :           /* Start a new DR group when the call below fails.  */
   10536     29692662 :           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
   10537              :                                               &dataref_groups, current_group))
   10538      5095393 :             ++current_group;
   10539              :         }
   10540              :       /* New BBs always start a new DR group.  */
   10541     10667894 :       ++current_group;
   10542              :     }
   10543              : 
   10544      1877972 :   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
   10545      1877972 : }
   10546              : 
   10547              : /* Special entry for the BB vectorizer.  Analyze and transform a single
   10548              :    if-converted BB with ORIG_LOOP's body being the not if-converted
   10549              :    representation.  Returns true if anything in the basic-block was
   10550              :    vectorized.  */
   10551              : 
   10552              : bool
   10553        19383 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
   10554              : {
   10555        19383 :   auto_vec<basic_block> bbs;
   10556        19383 :   bbs.safe_push (bb);
   10557        19383 :   return vect_slp_bbs (bbs, orig_loop);
   10558        19383 : }
   10559              : 
   10560              : /* Main entry for the BB vectorizer.  Split the basic blocks of FUN into
   10561              :    regions and vectorize each; returns true if anything was vectorized.  */
   10562              : 
   10563              : bool
   10564       909169 : vect_slp_function (function *fun)
   10565              : {
   10566       909169 :   bool r = false;
   10567       909169 :   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
   10568       909169 :   auto_bitmap exit_bbs;
   10569       909169 :   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
   10570       909169 :   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
   10571       909169 :   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
   10572       909169 :                                                       true, rpo, NULL);
   10573              : 
   10574              :   /* For the moment split the function into pieces to avoid making
   10575              :      the iteration on the vector mode moot.  Split at points we know
   10576              :      to not handle well which is CFG merges (SLP discovery doesn't
   10577              :      handle non-loop-header PHIs) and loop exits.  Since pattern
   10578              :      recog requires reverse iteration to visit uses before defs
   10579              :      simply chop RPO into pieces.  */
   10580       909169 :   auto_vec<basic_block> bbs;
   10581     11588604 :   for (unsigned i = 0; i < n; i++)
   10582              :     {
   10583     10679435 :       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
   10584     10679435 :       bool split = false;
   10585              : 
   10586              :       /* Split when a BB is not dominated by the first block.  */
   10587     20143687 :       if (!bbs.is_empty ()
   10588      9464252 :           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
   10589              :         {
   10590       663332 :           if (dump_enabled_p ())
   10591          146 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10592              :                              "splitting region at dominance boundary bb%d\n",
   10593              :                              bb->index);
   10594              :           split = true;
   10595              :         }
   10596              :       /* Split when the loop determined by the first block
   10597              :          is exited.  This is because we eventually insert
   10598              :          invariants at region begin.  */
   10599     18817023 :       else if (!bbs.is_empty ()
   10600      8800920 :                && bbs[0]->loop_father != bb->loop_father
   10601      2286617 :                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
   10602              :         {
   10603         3731 :           if (dump_enabled_p ())
   10604            6 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10605              :                              "splitting region at loop %d exit at bb%d\n",
   10606            3 :                              bbs[0]->loop_father->num, bb->index);
   10607              :           split = true;
   10608              :         }
   10609     10012372 :       else if (!bbs.is_empty ()
   10610      8797189 :                && bb->loop_father->header == bb
   10611       473774 :                && bb->loop_father->dont_vectorize)
   10612              :         {
   10613         7267 :           if (dump_enabled_p ())
   10614           72 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10615              :                              "splitting region at dont-vectorize loop %d "
   10616              :                              "entry at bb%d\n",
   10617              :                              bb->loop_father->num, bb->index);
   10618              :           split = true;
   10619              :         }
   10620              :       /* Vectorize the region collected so far and start a new one.  */
   10621     11353765 :       if (split && !bbs.is_empty ())
   10622              :         {
   10623       674330 :           r |= vect_slp_bbs (bbs, NULL);
   10624       674330 :           bbs.truncate (0);
   10625              :         }
   10626              : 
   10627     10679435 :       if (bbs.is_empty ())
   10628              :         {
   10629              :           /* We need to be able to insert at the head of the region which
   10630              :              we cannot for region starting with a returns-twice call.  */
   10631      1889513 :           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
   10632       400750 :             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
   10633              :               {
   10634          301 :                 if (dump_enabled_p ())
   10635            2 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10636              :                                    "skipping bb%d as start of region as it "
   10637              :                                    "starts with returns-twice call\n",
   10638              :                                    bb->index);
   10639        30924 :                 continue;
   10640              :               }
   10641              :           /* If the loop this BB belongs to is marked as not to be vectorized
   10642              :              honor that also for BB vectorization.  */
   10643      1889212 :           if (bb->loop_father->dont_vectorize)
   10644        30623 :             continue;
   10645              :         }
   10646              : 
   10647     10648511 :       bbs.safe_push (bb);
   10648              : 
   10649              :       /* When we have a stmt ending this block and defining a
   10650              :          value we have to insert on edges when inserting after it for
   10651              :          a vector containing its definition.  Avoid this for now.  */
   10652     21297022 :       if (gimple *last = *gsi_last_bb (bb))
   10653      8613944 :         if (gimple_get_lhs (last)
   10654      8613944 :             && is_ctrl_altering_stmt (last))
   10655              :           {
   10656       275097 :             if (dump_enabled_p ())
   10657            2 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10658              :                                "splitting region at control altering "
   10659              :                                "definition %G", last);
   10660       275097 :             r |= vect_slp_bbs (bbs, NULL);
   10661       275097 :             bbs.truncate (0);
   10662              :           }
   10663              :     }
   10664              :   /* Vectorize the last pending region.  */
   10665       909169 :   if (!bbs.is_empty ())
   10666       909162 :     r |= vect_slp_bbs (bbs, NULL);
   10667              : 
   10668       909169 :   free (rpo);
   10669              : 
   10670       909169 :   return r;
   10671       909169 : }
   10672              : 
   10673              : /* Build a variable-length vector in which the elements in ELTS are repeated
   10674              :    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   10675              :    RESULTS and add any new instructions to SEQ.
   10676              : 
   10677              :    The approach we use is:
   10678              : 
   10679              :    (1) Find a vector mode VM with integer elements of mode IM.
   10680              : 
   10681              :    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10682              :        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
   10683              :        from small vectors to IM.
   10684              : 
   10685              :    (3) Duplicate each ELTS'[I] into a vector of mode VM.
   10686              : 
   10687              :    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
   10688              :        correct byte contents.
   10689              : 
   10690              :    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
   10691              : 
   10692              :    We try to find the largest IM for which this sequence works, in order
   10693              :    to cut down on the number of interleaves.  */
   10694              : 
   10695              : void
   10696            0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
   10697              :                           const vec<tree> &elts, unsigned int nresults,
   10698              :                           vec<tree> &results)
   10699              : {
   10700            0 :   unsigned int nelts = elts.length ();
   10701            0 :   tree element_type = TREE_TYPE (vector_type);
   10702              : 
   10703              :   /* (1) Find a vector mode VM with integer elements of mode IM.
                      :      The helper also fills in NVECTORS, NEW_VECTOR_TYPE and PERMUTES
                      :      through its output arguments; the steps below rely on all three.
                      :      A failure here is treated as unreachable.  */
   10704            0 :   unsigned int nvectors = 1;
   10705            0 :   tree new_vector_type;
   10706            0 :   tree permutes[2];
   10707            0 :   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
   10708              :                                        &nvectors, &new_vector_type,
   10709              :                                        permutes))
   10710            0 :     gcc_unreachable ();
   10711              : 
   10712              :   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
   10713            0 :   unsigned int partial_nelts = nelts / nvectors;
   10714            0 :   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
   10715              : 
   10716            0 :   tree_vector_builder partial_elts;
   10717            0 :   auto_vec<tree, 32> pieces (nvectors * 2);
   10718            0 :   pieces.quick_grow_cleared (nvectors * 2);
   10719            0 :   for (unsigned int i = 0; i < nvectors; ++i)
   10720              :     {
   10721              :       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10722              :              ELTS' has mode IM.  */
   10723            0 :       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
   10724            0 :       for (unsigned int j = 0; j < partial_nelts; ++j)
   10725            0 :         partial_elts.quick_push (elts[i * partial_nelts + j]);
   10726            0 :       tree t = gimple_build_vector (seq, &partial_elts);
   10727            0 :       t = gimple_build (seq, VIEW_CONVERT_EXPR,
   10728            0 :                         TREE_TYPE (new_vector_type), t);
   10729              : 
   10730              :       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
   10731            0 :       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
   10732              :     }
   10733              : 
   10734              :   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
   10735              :          correct byte contents.
   10736              : 
   10737              :      Conceptually, we need to repeat the following operation log2(nvectors)
   10738              :      times, where hi_start = nvectors / 2:
   10739              : 
   10740              :         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
   10741              :         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
   10742              : 
   10743              :      However, if each input repeats every N elements and the VF is
   10744              :      a multiple of N * 2, the HI result is the same as the LO result.
   10745              :      This will be true for the first N1 iterations of the outer loop,
   10746              :      followed by N2 iterations for which both the LO and HI results
   10747              :      are needed.  I.e.:
   10748              : 
   10749              :         N1 + N2 = log2(nvectors)
   10750              : 
   10751              :      Each "N1 iteration" doubles the number of redundant vectors and the
   10752              :      effect of the process as a whole is to have a sequence of nvectors/2**N1
   10753              :      vectors that repeats 2**N1 times.  Rather than generate these redundant
   10754              :      vectors, we halve the number of vectors for each N1 iteration.
                      : 
                      :      PIECES has room for 2 * NVECTORS entries and is used as two
                      :      halves: each round reads its inputs starting at IN_START and
                      :      writes its outputs starting at OUT_START, after which the two
                      :      start indices are swapped.  NEW_NVECTORS is updated to the
                      :      number of outputs that the round actually produced.  */
   10755              :   unsigned int in_start = 0;
   10756              :   unsigned int out_start = nvectors;
   10757              :   unsigned int new_nvectors = nvectors;
   10758            0 :   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
   10759              :     {
   10760            0 :       unsigned int hi_start = new_nvectors / 2;
   10761            0 :       unsigned int out_i = 0;
   10762            0 :       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
   10763              :         {
   10764            0 :           if ((in_i & 1) != 0
   10765            0 :               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
   10766              :                              2 * in_repeat))
   10767            0 :             continue;
   10768              : 
   10769            0 :           tree output = make_ssa_name (new_vector_type);
   10770            0 :           tree input1 = pieces[in_start + (in_i / 2)];
   10771            0 :           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
   10772            0 :           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
   10773              :                                                input1, input2,
   10774              :                                                permutes[in_i & 1]);
   10775            0 :           gimple_seq_add_stmt (seq, stmt);
   10776            0 :           pieces[out_start + out_i] = output;
   10777            0 :           out_i += 1;
   10778              :         }
   10779            0 :       std::swap (in_start, out_start);
   10780            0 :       new_nvectors = out_i;
   10781              :     }
   10782              : 
   10783              :   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
                      :      Only NEW_NVECTORS distinct vectors were built; any further
                      :      entries of RESULTS repeat the earlier entries in order.  */
   10784            0 :   results.reserve (nresults);
   10785            0 :   for (unsigned int i = 0; i < nresults; ++i)
   10786            0 :     if (i < new_nvectors)
   10787            0 :       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
   10788            0 :                                         pieces[in_start + i]));
   10789              :     else
   10790            0 :       results.quick_push (results[i - new_nvectors]);
   10791            0 : }
   10792              : 
   10793              : 
   10794              : /* For constant and loop invariant defs in OP_NODE this function creates
   10795              :    vector defs that will be used in the vectorized stmts and stores them
   10796              :    to SLP_TREE_VEC_DEFS of OP_NODE.
                      : 
                      :    Statements needed to build a vector (conversions of the scalar
                      :    operands and the vector construction itself) are collected in a
                      :    local sequence.  When one vector is complete, that sequence is
                      :    emitted after the latest operand def that VINFO->lookup_def finds
                      :    (only looked for in BB vectorization), or through
                      :    VINFO->insert_seq_on_entry when there is no such def.  */
   10797              : 
   10798              : static void
   10799       490000 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
   10800              : {
   10801       490000 :   unsigned HOST_WIDE_INT nunits;
   10802       490000 :   tree vec_cst;
   10803       490000 :   unsigned j, number_of_places_left_in_vector;
   10804       490000 :   tree vector_type;
   10805       490000 :   tree vop;
   10806       490000 :   int group_size = op_node->ops.length ();
   10807       490000 :   unsigned int vec_num, i;
   10808       490000 :   unsigned number_of_copies = 1;
   10809       490000 :   bool constant_p;
   10810       490000 :   gimple_seq ctor_seq = NULL;
   10811       490000 :   auto_vec<tree, 16> permute_results;
   10812              : 
   10813              :   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
   10814       490000 :   vector_type = SLP_TREE_VECTYPE (op_node);
   10815              : 
   10816       490000 :   unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
   10817       490000 :   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
   10818       490000 :   auto_vec<tree> voprnds (number_of_vectors);
   10819              : 
   10820              :   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
   10821              :      created vectors. It is greater than 1 if unrolling is performed.
   10822              : 
   10823              :      For example, we have two scalar operands, s1 and s2 (e.g., group of
   10824              :      strided accesses of size two), while NUNITS is four (i.e., four scalars
   10825              :      of this type can be packed in a vector).  The output vector will contain
   10826              :      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
   10827              :      will be 2).
   10828              : 
   10829              :      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
   10830              :      containing the operands.
   10831              : 
   10832              :      For example, NUNITS is four as before, and the group size is 8
   10833              :      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
   10834              :      {s5, s6, s7, s8}.  */
   10835              : 
   10836              :   /* When using duplicate_and_interleave, we just need one element for
   10837              :      each scalar statement.  */
   10838       490000 :   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
   10839              :     nunits = group_size;
   10840              : 
   10841       490000 :   number_of_copies = nunits * number_of_vectors / group_size;
   10842              : 
   10843       490000 :   number_of_places_left_in_vector = nunits;
   10844       490000 :   constant_p = true;
   10845       490000 :   tree uniform_elt = NULL_TREE;
   10846       490000 :   tree_vector_builder elts (vector_type, nunits, 1);
   10847       490000 :   elts.quick_grow (nunits);
   10848       490000 :   stmt_vec_info insert_after = NULL;
   10849      1466692 :   for (j = 0; j < number_of_copies; j++)
   10850              :     {
   10851       976692 :       tree op;
   10852      3740292 :       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
   10853              :         {
   10854              :           /* Create 'vect_ = {op0,op1,...,opn}'.  The operands are visited
                      :              from the last to the first and ELTS is filled from its
                      :              highest index downwards.  UNIFORM_ELT stays set while
                      :              every operand of the current vector equals the first one
                      :              visited; in that case the element stored just before is
                      :              reused instead of converting OP again.  */
   10855      1786908 :           tree orig_op = op;
   10856      1786908 :           if (number_of_places_left_in_vector == nunits)
   10857              :             uniform_elt = op;
   10858      1167974 :           else if (uniform_elt && operand_equal_p (uniform_elt, op))
   10859       745234 :             op = elts[number_of_places_left_in_vector];
   10860              :           else
   10861              :             uniform_elt = NULL_TREE;
   10862      1786908 :           number_of_places_left_in_vector--;
   10863      1786908 :           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
   10864              :             {
   10865       274777 :               if (CONSTANT_CLASS_P (op))
   10866              :                 {
   10867       100349 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10868              :                     {
   10869              :                       /* Can't use VIEW_CONVERT_EXPR for booleans because
   10870              :                          of possibly different sizes of scalar value and
   10871              :                          vector element.  */
   10872           51 :                       if (integer_zerop (op))
   10873           51 :                         op = build_int_cst (TREE_TYPE (vector_type), 0);
   10874            0 :                       else if (integer_onep (op))
   10875            0 :                         op = build_all_ones_cst (TREE_TYPE (vector_type));
   10876              :                       else
   10877            0 :                         gcc_unreachable ();
   10878              :                     }
   10879              :                   else
   10880       100298 :                     op = fold_unary (VIEW_CONVERT_EXPR,
   10881              :                                      TREE_TYPE (vector_type), op);
   10882       100349 :                   gcc_assert (op && CONSTANT_CLASS_P (op));
   10883              :                 }
   10884              :               else
   10885              :                 {
   10886       174428 :                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
   10887       174428 :                   gimple *init_stmt;
   10888       174428 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10889              :                     {
   10890          403 :                       tree true_val
   10891          403 :                         = build_all_ones_cst (TREE_TYPE (vector_type));
   10892          403 :                       tree false_val
   10893          403 :                         = build_zero_cst (TREE_TYPE (vector_type));
   10894          403 :                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
   10895          403 :                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
   10896              :                                                        op, true_val,
   10897              :                                                        false_val);
   10898              :                     }
   10899              :                   else
   10900              :                     {
   10901       174025 :                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
   10902              :                                    op);
   10903       174025 :                       init_stmt
   10904       174025 :                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
   10905              :                                                op);
   10906              :                     }
   10907       174428 :                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
   10908       174428 :                   op = new_temp;
   10909              :                 }
   10910              :             }
   10911      1786908 :           elts[number_of_places_left_in_vector] = op;
   10912      1786908 :           if (!CONSTANT_CLASS_P (op))
   10913       316386 :             constant_p = false;
   10914              :           /* For BB vectorization we have to compute an insert location
   10915              :              when a def is inside the analyzed region since we cannot
   10916              :              simply insert at the BB start in this case.  */
   10917      1786908 :           stmt_vec_info opdef;
   10918      1786908 :           if (TREE_CODE (orig_op) == SSA_NAME
   10919       181541 :               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
   10920       161741 :               && is_a <bb_vec_info> (vinfo)
   10921      1890826 :               && (opdef = vinfo->lookup_def (orig_op)))
   10922              :             {
   10923        85209 :               if (!insert_after)
   10924              :                 insert_after = opdef;
   10925              :               else
   10926        47059 :                 insert_after = get_later_stmt (insert_after, opdef);
   10927              :             }
   10928              : 
   10929      1786908 :           if (number_of_places_left_in_vector == 0)
   10930              :             {
   10931       618934 :               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
   10932       618934 :               if (uniform_elt)
   10933       646930 :                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
   10934       323465 :                                                         elts[0]);
   10935       590938 :               else if (constant_p
   10936       590938 :                        ? multiple_p (type_nunits, nunits)
   10937       109163 :                        : known_eq (type_nunits, nunits))
   10938       295469 :                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
   10939              :               else
   10940              :                 {
   10941            0 :                   if (permute_results.is_empty ())
   10942            0 :                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
   10943              :                                               elts, number_of_vectors,
   10944              :                                               permute_results);
   10945            0 :                   vec_cst = permute_results[number_of_vectors - j - 1];
   10946              :                 }
   10947       618934 :               if (!gimple_seq_empty_p (ctor_seq))
   10948              :                 {
   10949       136314 :                   if (insert_after)
   10950              :                     {
   10951        38150 :                       gimple_stmt_iterator gsi;
   10952        38150 :                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
   10953              :                         {
   10954          614 :                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
   10955          614 :                           gsi_insert_seq_before (&gsi, ctor_seq,
   10956              :                                                  GSI_CONTINUE_LINKING);
   10957              :                         }
   10958        37536 :                       else if (!stmt_ends_bb_p (insert_after->stmt))
   10959              :                         {
   10960        37536 :                           gsi = gsi_for_stmt (insert_after->stmt);
   10961        37536 :                           gsi_insert_seq_after (&gsi, ctor_seq,
   10962              :                                                 GSI_CONTINUE_LINKING);
   10963              :                         }
   10964              :                       else
   10965              :                         {
   10966              :                           /* When we want to insert after a def where the
   10967              :                              defining stmt throws then insert on the fallthru
   10968              :                              edge.  */
   10969            0 :                           edge e = find_fallthru_edge
   10970            0 :                                      (gimple_bb (insert_after->stmt)->succs);
   10971            0 :                           basic_block new_bb
   10972            0 :                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
   10973            0 :                           gcc_assert (!new_bb);
   10974              :                         }
   10975              :                     }
   10976              :                   else
   10977        98164 :                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
   10978       136314 :                   ctor_seq = NULL;
   10979              :                 }
   10980       618934 :               voprnds.quick_push (vec_cst);
   10981       618934 :               insert_after = NULL;
   10982       618934 :               number_of_places_left_in_vector = nunits;
   10983       618934 :               constant_p = true;
   10984       618934 :               elts.new_vector (vector_type, nunits, 1);
   10985       618934 :               elts.quick_grow (nunits);
   10986              :             }
   10987              :         }
   10988              :     }
   10989              : 
   10990              :   /* Since the vectors are created in the reverse order, we should invert
   10991              :      them.  */
   10992       490000 :   vec_num = voprnds.length ();
   10993      1108934 :   for (j = vec_num; j != 0; j--)
   10994              :     {
   10995       618934 :       vop = voprnds[j - 1];
   10996       618934 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   10997              :     }
   10998              : 
   10999              :   /* In case that VF is greater than the unrolling factor needed for the SLP
   11000              :      group of stmts, NUMBER_OF_VECTORS to be created is greater than
   11001              :      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
   11002              :      to replicate the vectors.  */
   11003       490000 :   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
   11004       490000 :     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
   11005              :          i++)
   11006            0 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   11007       490000 : }
   11008              : 
   11009              : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
   11010              :    if there is no definition for it in the scalar IL or it is not known.
                      : 
                      :    For a vect_internal_def node the result is the LHS of the Nth entry
                      :    of SLP_TREE_SCALAR_STMTS; NULL_TREE is returned when that vector does
                      :    not exist or the Nth entry is null.  For any other def type the Nth
                      :    entry of SLP_TREE_SCALAR_OPS is returned as is.  */
   11011              : 
   11012              : tree
   11013         1909 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
   11014              : {
   11015         1909 :   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
   11016              :     {
   11017         1899 :       if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
   11018              :         return NULL_TREE;
   11019         1899 :       stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
   11020         1899 :       if (!def)
   11021              :         return NULL_TREE;
   11022         1899 :       return gimple_get_lhs (STMT_VINFO_STMT (def));
   11023              :     }
   11024              :   else
   11025           10 :     return SLP_TREE_SCALAR_OPS (slp_node)[n];
   11026              : }
   11027              : 
   11028              : /* Get the Ith vectorized definition from SLP_NODE, i.e. the element of
                      :    SLP_TREE_VEC_DEFS (SLP_NODE) at index I.  I is used directly as the
                      :    index; this function does not validate it itself.  */
   11029              : 
   11030              : tree
   11031       145845 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
   11032              : {
   11033       145845 :   return SLP_TREE_VEC_DEFS (slp_node)[i];
   11034              : }
   11035              : 
   11036              : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  *VEC_DEFS is
                      :    created here with room for as many entries as SLP_TREE_VEC_DEFS
                      :    (SLP_NODE) holds, and those entries are then spliced into it.  */
   11037              : 
   11038              : void
   11039       925642 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
   11040              : {
   11041      1851284 :   vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
   11042       925642 :   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
   11043       925642 : }
   11044              : 
   11045              : /* Get N vectorized definitions for SLP_NODE.  For each of the first N
                      :    children of SLP_NODE a freshly created vector holding that child's
                      :    vectorized definitions is pushed to *VEC_OPRNDS.  N == -1U selects
                      :    all children.  The vec_info argument is unnamed and unused.
                      : 
                      :    NOTE(review): the vectors are added with quick_push, so presumably
                      :    the caller must already have reserved room in *VEC_OPRNDS; confirm
                      :    against the callers.  */
   11046              : 
   11047              : void
   11048         2955 : vect_get_slp_defs (vec_info *,
   11049              :                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
   11050              : {
   11051         2955 :   if (n == -1U)
   11052         2955 :     n = SLP_TREE_CHILDREN (slp_node).length ();
   11053              : 
   11054        10648 :   for (unsigned i = 0; i < n; ++i)
   11055              :     {
   11056         7693 :       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
   11057         7693 :       vec<tree> vec_defs = vNULL;
   11058         7693 :       vect_get_slp_defs (child, &vec_defs);
   11059         7693 :       vec_oprnds->quick_push (vec_defs);
   11060              :     }
   11061         2955 : }
   11062              : 
   11063              : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   11064              :    - PERM gives the permutation that the caller wants to use for NODE,
   11065              :      which might be different from SLP_LOAD_PERMUTATION.
   11066              :    - DUMP_P controls whether the function dumps information.  */
   11067              : 
   11068              : static bool
   11069       125915 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   11070              :                                 load_permutation_t &perm,
   11071              :                                 const vec<tree> &dr_chain,
   11072              :                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
   11073              :                                 bool analyze_only, bool dump_p,
   11074              :                                 unsigned *n_perms, unsigned int *n_loads,
   11075              :                                 bool dce_chain)
   11076              : {
   11077       125915 :   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   11078       125915 :   int vec_index = 0;
   11079       125915 :   tree vectype = SLP_TREE_VECTYPE (node);
   11080       125915 :   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   11081       125915 :   unsigned int mask_element;
   11082       125915 :   unsigned dr_group_size;
   11083       125915 :   machine_mode mode;
   11084              : 
   11085       125915 :   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
   11086              :     {
   11087              :       /* We have both splats of the same non-grouped load and groups
   11088              :          of distinct invariant loads entering here.  */
   11089         1205 :       unsigned max_idx = 0;
   11090         6793 :       for (auto idx : perm)
   11091         3178 :         max_idx = idx > max_idx ? idx : max_idx;
   11092         1205 :       dr_group_size = max_idx + 1;
   11093              :     }
   11094              :   else
   11095              :     {
   11096       124710 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
   11097       124710 :       dr_group_size = DR_GROUP_SIZE (stmt_info);
   11098              :     }
   11099              : 
   11100       125915 :   mode = TYPE_MODE (vectype);
   11101       125915 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   11102       125915 :   unsigned int nstmts = vect_get_num_copies (vinfo, node);
   11103              : 
   11104              :   /* Initialize the vect stmts of NODE to properly insert the generated
   11105              :      stmts later.  */
   11106       125915 :   if (! analyze_only)
   11107        57661 :     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
   11108        22219 :       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
   11109              : 
   11110              :   /* Generate permutation masks for every NODE. Number of masks for each NODE
   11111              :      is equal to GROUP_SIZE.
   11112              :      E.g., we have a group of three nodes with three loads from the same
   11113              :      location in each node, and the vector size is 4. I.e., we have a
   11114              :      a0b0c0a1b1c1... sequence and we need to create the following vectors:
   11115              :      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
   11116              :      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
   11117              :      ...
   11118              : 
   11119              :      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
   11120              :      The last mask is illegal since we assume two operands for permute
   11121              :      operation, and the mask element values can't be outside that range.
   11122              :      Hence, the last mask must be converted into {2,5,5,5}.
   11123              :      For the first two permutations we need the first and the second input
   11124              :      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
   11125              :      we need the second and the third vectors: {b1,c1,a2,b2} and
   11126              :      {c2,a3,b3,c3}.  */
   11127              : 
   11128       125915 :   int vect_stmts_counter = 0;
   11129       125915 :   unsigned int index = 0;
   11130       125915 :   int first_vec_index = -1;
   11131       125915 :   int second_vec_index = -1;
   11132       125915 :   bool noop_p = true;
   11133       125915 :   *n_perms = 0;
   11134              : 
   11135       125915 :   vec_perm_builder mask;
   11136       125915 :   unsigned int nelts_to_build;
   11137       125915 :   unsigned int nvectors_per_build;
   11138       125915 :   unsigned int in_nlanes;
   11139       125915 :   bool repeating_p = (group_size == dr_group_size
   11140       157546 :                       && multiple_p (nunits, group_size));
   11141       125915 :   if (repeating_p)
   11142              :     {
   11143              :       /* A single vector contains a whole number of copies of the node, so:
   11144              :          (a) all permutes can use the same mask; and
   11145              :          (b) the permutes only need a single vector input.  */
   11146        29464 :       mask.new_vector (nunits, group_size, 3);
   11147        29464 :       nelts_to_build = mask.encoded_nelts ();
   11148              :       /* It's possible to obtain zero nstmts during analyze_only, so make
   11149              :          it at least one to ensure the later computation for n_perms
   11150              :          proceed.  */
   11151        29464 :       nvectors_per_build = nstmts > 0 ? nstmts : 1;
   11152        29464 :       in_nlanes = dr_group_size * 3;
   11153              :     }
   11154              :   else
   11155              :     {
   11156              :       /* We need to construct a separate mask for each vector statement.  */
   11157        96451 :       unsigned HOST_WIDE_INT const_nunits, const_vf;
   11158        96451 :       if (!nunits.is_constant (&const_nunits)
   11159        96451 :           || !vf.is_constant (&const_vf))
   11160              :         return false;
   11161        96451 :       mask.new_vector (const_nunits, const_nunits, 1);
   11162        96451 :       nelts_to_build = const_vf * group_size;
   11163        96451 :       nvectors_per_build = 1;
   11164        96451 :       in_nlanes = const_vf * dr_group_size;
   11165              :     }
   11166       125915 :   auto_sbitmap used_in_lanes (in_nlanes);
   11167       125915 :   bitmap_clear (used_in_lanes);
   11168       125915 :   auto_bitmap used_defs;
   11169              : 
   11170       125915 :   unsigned int count = mask.encoded_nelts ();
   11171       125915 :   mask.quick_grow (count);
   11172       125915 :   vec_perm_indices indices;
   11173              : 
   11174       660604 :   for (unsigned int j = 0; j < nelts_to_build; j++)
   11175              :     {
   11176       544566 :       unsigned int iter_num = j / group_size;
   11177       544566 :       unsigned int stmt_num = j % group_size;
   11178       544566 :       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
   11179       544566 :       bitmap_set_bit (used_in_lanes, i);
   11180       544566 :       if (repeating_p)
   11181              :         {
   11182              :           first_vec_index = 0;
   11183              :           mask_element = i;
   11184              :         }
   11185              :       else
   11186              :         {
   11187              :           /* Enforced before the loop when !repeating_p.  */
   11188       354684 :           unsigned int const_nunits = nunits.to_constant ();
   11189       354684 :           vec_index = i / const_nunits;
   11190       354684 :           mask_element = i % const_nunits;
   11191       354684 :           if (vec_index == first_vec_index
   11192       354684 :               || first_vec_index == -1)
   11193              :             {
   11194              :               first_vec_index = vec_index;
   11195              :             }
   11196       140242 :           else if (vec_index == second_vec_index
   11197       140242 :                    || second_vec_index == -1)
   11198              :             {
   11199       133838 :               second_vec_index = vec_index;
   11200       133838 :               mask_element += const_nunits;
   11201              :             }
   11202              :           else
   11203              :             {
   11204         6404 :               if (dump_p)
   11205          366 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11206              :                                  "permutation requires at "
   11207              :                                  "least three vectors %G",
   11208              :                                  stmt_info->stmt);
   11209         6404 :               gcc_assert (analyze_only);
   11210              :               return false;
   11211              :             }
   11212              : 
   11213       348280 :           gcc_assert (mask_element < 2 * const_nunits);
   11214              :         }
   11215              : 
   11216       538162 :       if (mask_element != index)
   11217       339363 :         noop_p = false;
   11218       538162 :       mask[index++] = mask_element;
   11219              : 
   11220       538162 :       if (index == count)
   11221              :         {
   11222       153949 :           if (!noop_p)
   11223              :             {
   11224       207086 :               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
   11225       121438 :               if (!can_vec_perm_const_p (mode, mode, indices))
   11226              :                 {
   11227         3473 :                   if (dump_p)
   11228              :                     {
   11229           79 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11230              :                                        "unsupported vect permute { ");
   11231          669 :                       for (i = 0; i < count; ++i)
   11232              :                         {
   11233          590 :                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11234          590 :                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11235              :                         }
   11236           79 :                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11237              :                     }
   11238         3473 :                   gcc_assert (analyze_only);
   11239              :                   return false;
   11240              :                 }
   11241              : 
   11242       117965 :               tree mask_vec = NULL_TREE;
   11243       117965 :               if (!analyze_only)
   11244        20579 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11245              : 
   11246       117965 :               if (second_vec_index == -1)
   11247        33884 :                 second_vec_index = first_vec_index;
   11248              : 
   11249       237921 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11250              :                 {
   11251       119956 :                   ++*n_perms;
   11252       119956 :                   if (analyze_only)
   11253        99094 :                     continue;
   11254              :                   /* Generate the permute statement if necessary.  */
   11255        20862 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11256        20862 :                   tree second_vec = dr_chain[second_vec_index + ri];
   11257        20862 :                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
   11258        20862 :                   tree perm_dest
   11259        20862 :                     = vect_create_destination_var (gimple_assign_lhs (stmt),
   11260              :                                                    vectype);
   11261        20862 :                   perm_dest = make_ssa_name (perm_dest);
   11262        20862 :                   gimple *perm_stmt
   11263        20862 :                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
   11264              :                                            second_vec, mask_vec);
   11265        20862 :                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
   11266              :                                                gsi);
   11267        20862 :                   if (dce_chain)
   11268              :                     {
   11269        20093 :                       bitmap_set_bit (used_defs, first_vec_index + ri);
   11270        20093 :                       bitmap_set_bit (used_defs, second_vec_index + ri);
   11271              :                     }
   11272              : 
   11273              :                   /* Store the vector statement in NODE.  */
   11274        20862 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
   11275              :                 }
   11276              :             }
   11277        32511 :           else if (!analyze_only)
   11278              :             {
   11279         2714 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11280              :                 {
   11281         1357 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11282              :                   /* If mask was NULL_TREE generate the requested
   11283              :                      identity transform.  */
   11284         1357 :                   if (dce_chain)
   11285         1356 :                     bitmap_set_bit (used_defs, first_vec_index + ri);
   11286              : 
   11287              :                   /* Store the vector statement in NODE.  */
   11288         1357 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
   11289              :                 }
   11290              :             }
   11291              : 
   11292              :           index = 0;
   11293              :           first_vec_index = -1;
   11294              :           second_vec_index = -1;
   11295              :           noop_p = true;
   11296              :         }
   11297              :     }
   11298              : 
   11299       116038 :   if (n_loads)
   11300              :     {
   11301        81738 :       if (repeating_p)
   11302        10396 :         *n_loads = nstmts;
   11303              :       else
   11304              :         {
   11305              :           /* Enforced above when !repeating_p.  */
   11306        71342 :           unsigned int const_nunits = nunits.to_constant ();
   11307        71342 :           *n_loads = 0;
   11308        71342 :           bool load_seen = false;
   11309       997036 :           for (unsigned i = 0; i < in_nlanes; ++i)
   11310              :             {
   11311       925694 :               if (i % const_nunits == 0)
   11312              :                 {
   11313       407271 :                   if (load_seen)
   11314       122397 :                     *n_loads += 1;
   11315              :                   load_seen = false;
   11316              :                 }
   11317       925694 :               if (bitmap_bit_p (used_in_lanes, i))
   11318       252217 :                 load_seen = true;
   11319              :             }
   11320        71342 :           if (load_seen)
   11321        42922 :             *n_loads += 1;
   11322              :         }
   11323              :     }
   11324              : 
   11325       116038 :   if (dce_chain)
   11326       215123 :     for (unsigned i = 0; i < dr_chain.length (); ++i)
   11327        72108 :       if (!bitmap_bit_p (used_defs, i))
   11328              :         {
   11329        39350 :           tree def = dr_chain[i];
   11330        39685 :           do
   11331              :             {
   11332        39685 :               gimple *stmt = SSA_NAME_DEF_STMT (def);
   11333        39685 :               if (is_gimple_assign (stmt)
   11334        39685 :                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
   11335        39685 :                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
   11336         4916 :                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
   11337              :               else
   11338              :                 def = NULL;
   11339        39685 :               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
   11340        39685 :               gsi_remove (&rgsi, true);
   11341        39685 :               release_defs (stmt);
   11342              :             }
   11343        39685 :           while (def);
   11344              :         }
   11345              : 
   11346              :   return true;
   11347       125915 : }
   11348              : 
   11349              : /* Generate vector permute statements from a list of loads in DR_CHAIN.
   11350              :    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   11351              :    permute statements for the SLP node NODE.  Store the number of vector
   11352              :    permute instructions in *N_PERMS and the number of vector load
   11353              :    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   11354              :    that were not needed.  */
   11355              : 
   11356              : bool
   11357        90755 : vect_transform_slp_perm_load (vec_info *vinfo,
   11358              :                               slp_tree node, const vec<tree> &dr_chain,
   11359              :                               gimple_stmt_iterator *gsi, poly_uint64 vf,
   11360              :                               bool analyze_only, unsigned *n_perms,
   11361              :                               unsigned int *n_loads, bool dce_chain)
   11362              : {
   11363        90755 :   return vect_transform_slp_perm_load_1 (vinfo, node,
   11364        90755 :                                          SLP_TREE_LOAD_PERMUTATION (node),
   11365              :                                          dr_chain, gsi, vf, analyze_only,
   11366              :                                          dump_enabled_p (), n_perms, n_loads,
   11367        90755 :                                          dce_chain);
   11368              : }
   11369              : 
   11370              : /* Produce the next vector result for SLP permutation NODE by adding a vector
   11371              :    statement at GSI.  If MASK_VEC is nonnull, add:
   11372              : 
   11373              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
   11374              : 
   11375              :    otherwise add:
   11376              : 
   11377              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
   11378              :                                       { N, N+1, N+2, ... }>
   11379              : 
   11380              :    where N == IDENTITY_OFFSET which is either zero or equal to the
   11381              :    number of elements of the result.  */
   11382              : 
   11383              : static void
   11384        31376 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11385              :                           slp_tree node, tree first_def, tree second_def,
   11386              :                           tree mask_vec, poly_uint64 identity_offset)
   11387              : {
   11388        31376 :   tree vectype = SLP_TREE_VECTYPE (node);
   11389              : 
   11390              :   /* ???  We SLP match existing vector element extracts but
   11391              :      allow punning which we need to re-instantiate at uses
   11392              :      but have no good way of explicitly representing.  */
   11393        31376 :   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
   11394        31376 :       && !types_compatible_p (TREE_TYPE (first_def), vectype))
   11395              :     {
   11396           14 :       gassign *conv_stmt
   11397           14 :         = gimple_build_assign (make_ssa_name (vectype),
   11398              :                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
   11399           14 :       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11400           14 :       first_def = gimple_assign_lhs (conv_stmt);
   11401              :     }
   11402        31376 :   gassign *perm_stmt;
   11403        31376 :   tree perm_dest = make_ssa_name (vectype);
   11404        31376 :   if (mask_vec)
   11405              :     {
   11406        28100 :       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
   11407        28100 :                            TYPE_SIZE (vectype))
   11408        28100 :           && !types_compatible_p (TREE_TYPE (second_def), vectype))
   11409              :         {
   11410            8 :           gassign *conv_stmt
   11411            8 :             = gimple_build_assign (make_ssa_name (vectype),
   11412              :                                    build1 (VIEW_CONVERT_EXPR,
   11413              :                                            vectype, second_def));
   11414            8 :           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11415            8 :           second_def = gimple_assign_lhs (conv_stmt);
   11416              :         }
   11417        28100 :       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
   11418              :                                        first_def, second_def,
   11419              :                                        mask_vec);
   11420              :     }
   11421              :   else
   11422              :     {
   11423         3276 :       auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
   11424         3276 :       unsigned HOST_WIDE_INT vecno;
   11425         3276 :       poly_uint64 eltno;
   11426         3276 :       if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
   11427              :                             &vecno, &eltno))
   11428              :         gcc_unreachable ();
   11429         3276 :       tree def = vecno & 1 ? second_def : first_def;
   11430         3276 :       if (!types_compatible_p (TREE_TYPE (def), vectype))
   11431              :         {
   11432              :           /* For identity permutes we still need to handle the case
   11433              :              of offsetted extracts or concats.  */
   11434          261 :           unsigned HOST_WIDE_INT c;
   11435          261 :           if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
   11436              :             {
   11437          257 :               unsigned HOST_WIDE_INT elsz
   11438          257 :                 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
   11439          514 :               tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
   11440          257 :                                      TYPE_SIZE (vectype),
   11441          257 :                                      bitsize_int (eltno * elsz));
   11442          257 :               perm_stmt = gimple_build_assign (perm_dest, lowpart);
   11443              :             }
   11444            4 :           else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
   11445            4 :                                         def_nunits, &c) && c == 2)
   11446              :             {
   11447            4 :               gcc_assert (known_eq (identity_offset, 0U));
   11448            4 :               tree ctor = build_constructor_va (vectype, 2,
   11449              :                                                 NULL_TREE, first_def,
   11450              :                                                 NULL_TREE, second_def);
   11451            4 :               perm_stmt = gimple_build_assign (perm_dest, ctor);
   11452              :             }
   11453              :           else
   11454            0 :             gcc_unreachable ();
   11455              :         }
   11456              :       else
   11457              :         {
   11458              :           /* We need a copy here in case the def was external.  */
   11459         3015 :           gcc_assert (known_eq (eltno, 0U));
   11460         3015 :           perm_stmt = gimple_build_assign (perm_dest, def);
   11461              :         }
   11462              :     }
   11463        31376 :   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
   11464              :   /* Store the vector statement in NODE.  */
   11465        31376 :   node->push_vec_def (perm_stmt);
   11466        31376 : }
   11467              : 
   11468              : /* Subroutine of vectorizable_slp_permutation.  Check whether the target
   11469              :    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
   11470              :    If GSI is nonnull, emit the permutation there.
   11471              : 
   11472              :    When GSI is null, the only purpose of NODE is to give properties
   11473              :    of the result, such as the vector type and number of SLP lanes.
   11474              :    The node does not need to be a VEC_PERM_EXPR.
   11475              : 
   11476              :    If the target supports the operation, return the number of individual
   11477              :    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
   11478              :    dump file if DUMP_P is true.  */
   11479              : 
   11480              : static int
   11481       435500 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11482              :                                 slp_tree node, lane_permutation_t &perm,
   11483              :                                 vec<slp_tree> &children, bool dump_p)
   11484              : {
   11485       435500 :   tree vectype = SLP_TREE_VECTYPE (node);
   11486              : 
   11487              :   /* ???  We currently only support all same vector input types
   11488              :      while the SLP IL should really do a concat + select and thus accept
   11489              :      arbitrary mismatches.  */
   11490       435500 :   slp_tree child;
   11491       435500 :   unsigned i;
   11492       435500 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   11493       435500 :   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
   11494              :   /* True if we're permuting a single input of 2N vectors down
   11495              :      to N vectors.  This case doesn't generalize beyond 2 since
   11496              :      VEC_PERM_EXPR only takes 2 inputs.  */
   11497       435500 :   bool pack_p = false;
   11498              :   /* If we're permuting inputs of N vectors each into X*N outputs,
   11499              :      this is the value of X, otherwise it is 1.  */
   11500       435500 :   unsigned int unpack_factor = 1;
   11501       435500 :   tree op_vectype = NULL_TREE;
   11502       436685 :   FOR_EACH_VEC_ELT (children, i, child)
   11503       436606 :     if (SLP_TREE_VECTYPE (child))
   11504              :       {
   11505              :         op_vectype = SLP_TREE_VECTYPE (child);
   11506              :         break;
   11507              :       }
   11508       435500 :   if (!op_vectype)
   11509           79 :     op_vectype = vectype;
   11510       930282 :   FOR_EACH_VEC_ELT (children, i, child)
   11511              :     {
   11512       494782 :       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
   11513        10077 :            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
   11514       494782 :           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
   11515       989564 :           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
   11516              :         {
   11517            0 :           if (dump_p)
   11518            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11519              :                              "Unsupported vector types in lane permutation\n");
   11520            0 :           return -1;
   11521              :         }
   11522       494782 :       auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
   11523       494782 :       unsigned int this_unpack_factor;
   11524              :       /* Detect permutations of external, pre-existing vectors.  The external
   11525              :          node's SLP_TREE_LANES stores the total number of units in the vector,
   11526              :          or zero if the vector has variable length.
   11527              : 
   11528              :          We are expected to keep the original VEC_PERM_EXPR for such cases.
   11529              :          There is no repetition to model.  */
   11530       494782 :       if (SLP_TREE_DEF_TYPE (child) == vect_external_def
   11531       494782 :           && SLP_TREE_SCALAR_OPS (child).is_empty ())
   11532              :         repeating_p = false;
   11533              :       /* Check whether the input has twice as many lanes per vector.  */
   11534       486819 :       else if (children.length () == 1
   11535       486819 :                && known_eq (SLP_TREE_LANES (child) * nunits,
   11536              :                             SLP_TREE_LANES (node) * op_nunits * 2))
   11537              :         pack_p = true;
   11538              :       /* Check whether the output has N times as many lanes per vector.  */
   11539       494782 :       else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
   11540       444222 :                                     SLP_TREE_LANES (child) * nunits,
   11541              :                                     &this_unpack_factor)
   11542       409559 :                && (i == 0 || unpack_factor == this_unpack_factor))
   11543              :         unpack_factor = this_unpack_factor;
   11544              :       else
   11545              :         repeating_p = false;
   11546              :     }
   11547              : 
   11548       871000 :   gcc_assert (perm.length () == SLP_TREE_LANES (node));
   11549              : 
   11550              :   /* Load-lanes permute.  This permute only acts as a forwarder to
   11551              :      select the correct vector def of the load-lanes load which
   11552              :      has the permuted vectors in its vector defs like
   11553              :      { v0, w0, r0, v1, w1, r1 ... } for a ld3.  All costs are
   11554              :      accounted for in the costing for the actual load so we
   11555              :      return zero here.  */
   11556       435500 :   if (node->ldst_lanes)
   11557              :     {
   11558            0 :       gcc_assert (children.length () == 1);
   11559            0 :       if (!gsi)
   11560              :         /* This is a trivial op always supported.  */
   11561              :         return 0;
   11562            0 :       slp_tree child = children[0];
   11563            0 :       unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
   11564            0 :                           / SLP_TREE_LANES (node));
   11565            0 :       unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
   11566            0 :       unsigned nvectors = vect_get_num_copies (vinfo, node);
   11567            0 :       for (unsigned i = 0; i < nvectors; ++i)
   11568              :         {
   11569            0 :           tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num  + vec_idx];
   11570            0 :           node->push_vec_def (def);
   11571              :         }
   11572              :       return 0;
   11573              :     }
   11574              : 
   11575              :   /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
   11576              :      and if we can generate the vectors in a vector-length agnostic way.
   11577              :      This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
   11578              :      compile time.
   11579              : 
   11580              :      The significance of UNPACK_STEP is that, when PACK_P is false,
   11581              :      output vector I operates on a window of UNPACK_STEP elements from each
   11582              :      input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
   11583              :      when UNPACK_FACTOR is 2, the first output vector operates on lanes
   11584              :      [0, NUNITS / 2 - 1] of each input vector and the second output vector
   11585              :      operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
   11586              : 
   11587              :      When REPEATING_P is true, NOUTPUTS holds the total number of outputs
   11588              :      that we actually need to generate.  */
   11589       435500 :   uint64_t noutputs = 0;
   11590       435500 :   poly_uint64 unpack_step = 0;
   11591       435500 :   loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
   11592       148764 :   if (!linfo
   11593       473731 :       || !multiple_p (nunits, unpack_factor, &unpack_step)
   11594       147864 :       || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
   11595       147864 :                                * SLP_TREE_LANES (node), nunits, &noutputs))
   11596              :     repeating_p = false;
   11597              : 
   11598              :   /* We can handle the conditions described for REPEATING_P above for
   11599              :      both variable- and constant-length vectors.  The fallback requires
   11600              :      us to generate every element of every permute vector explicitly,
   11601              :      which is only possible for constant-length permute vectors.
   11602              : 
   11603              :      Set:
   11604              : 
   11605              :      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
   11606              :        mask vectors that we want to build.
   11607              : 
   11608              :      - NCOPIES to the number of copies of PERM that we need in order
   11609              :        to build the necessary permute mask vectors.  */
   11610       147864 :   uint64_t npatterns;
   11611       147864 :   unsigned nelts_per_pattern;
   11612       147864 :   uint64_t ncopies;
   11613       147864 :   if (repeating_p)
   11614              :     {
   11615              :       /* We need permute mask vectors that have the form:
   11616              : 
   11617              :            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
   11618              : 
   11619              :          In other words, the original n-element permute in PERM is
   11620              :          "unrolled" to fill a full vector.  The stepped vector encoding
   11621              :          that we use for permutes requires 3n elements.  */
   11622       109633 :       npatterns = SLP_TREE_LANES (node);
   11623       109633 :       nelts_per_pattern = ncopies = 3;
   11624              :     }
   11625              :   else
   11626              :     {
   11627              :       /* Calculate every element of every permute mask vector explicitly,
   11628              :          instead of relying on the pattern described above.  */
   11629       325867 :       if (!nunits.is_constant (&npatterns)
   11630       325867 :           || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
   11631              :         {
   11632              :           if (dump_p)
   11633              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11634              :                              "unsupported permutation %p on variable-length"
   11635              :                              " vectors\n", (void *) node);
   11636              :           return -1;
   11637              :         }
   11638       325867 :       nelts_per_pattern = ncopies = 1;
   11639       325867 :       if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
   11640              :         {
   11641              :           if (dump_p)
   11642              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11643              :                              "unsupported permutation %p for variable VF\n",
   11644              :                              (void *) node);
   11645              :           return -1;
   11646              :         }
   11647              :       pack_p = false;
   11648              :       unpack_factor = 1;
   11649              :     }
   11650       435500 :   unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
   11651       435500 :   gcc_assert (repeating_p || multiple_p (olanes, nunits));
   11652              : 
   11653              :   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
   11654              :      from the { SLP operand, scalar lane } permutation as recorded in the
   11655              :      SLP node as intermediate step.  This part should already work
   11656              :      with SLP children with arbitrary number of lanes.  */
   11657       435500 :   auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
   11658       435500 :   auto_vec<poly_uint64> active_lane;
   11659       435500 :   vperm.create (olanes);
   11660       435500 :   active_lane.safe_grow_cleared (children.length (), true);
   11661       877970 :   for (unsigned int ui = 0; ui < unpack_factor; ++ui)
   11662              :     {
   11663      1902336 :       for (unsigned j = 0; j < children.length (); ++j)
   11664       508698 :         active_lane[j] = ui * unpack_step;
   11665      1217188 :       for (unsigned i = 0; i < ncopies; ++i)
   11666              :         {
   11667      4841970 :           for (unsigned pi = 0; pi < perm.length (); ++pi)
   11668              :             {
   11669      1646267 :               std::pair<unsigned, unsigned> p = perm[pi];
   11670      1646267 :               tree vtype = SLP_TREE_VECTYPE (children[p.first]);
   11671      1646267 :               if (repeating_p)
   11672       626667 :                 vperm.quick_push ({{p.first, 0},
   11673       626667 :                                    p.second + active_lane[p.first]});
   11674              :               else
   11675              :                 {
   11676              :                   /* We checked above that the vectors are constant-length.  */
   11677      1019600 :                   unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
   11678      1019600 :                     .to_constant ();
   11679      1019600 :                   unsigned lane = active_lane[p.first].to_constant ();
   11680      1019600 :                   unsigned vi = (lane + p.second) / vnunits;
   11681      1019600 :                   unsigned vl = (lane + p.second) % vnunits;
   11682      1019600 :                   vperm.quick_push ({{p.first, vi}, vl});
   11683              :                 }
   11684              :             }
   11685              :           /* Advance to the next group.  */
   11686      1668637 :           for (unsigned j = 0; j < children.length (); ++j)
   11687       893919 :             active_lane[j] += SLP_TREE_LANES (children[j]);
   11688              :         }
   11689              :     }
   11690              : 
   11691       435500 :   if (dump_p)
   11692              :     {
   11693         8827 :       dump_printf_loc (MSG_NOTE, vect_location,
   11694              :                        "vectorizing permutation %p", (void *)node);
   11695        31996 :       for (unsigned i = 0; i < perm.length (); ++i)
   11696        23169 :         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
   11697         8827 :       if (repeating_p)
   11698         7427 :         dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
   11699         8827 :       dump_printf (MSG_NOTE, "\n");
   11700         8827 :       dump_printf_loc (MSG_NOTE, vect_location, "as");
   11701        88790 :       for (unsigned i = 0; i < vperm.length (); ++i)
   11702              :         {
   11703        79963 :           if (i != 0
   11704        79963 :               && (repeating_p
   11705        53986 :                   ? multiple_p (i, npatterns)
   11706        59505 :                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
   11707        23952 :             dump_printf (MSG_NOTE, ",");
   11708        79963 :           dump_printf (MSG_NOTE, " vops%u[%u][",
   11709        79963 :                        vperm[i].first.first, vperm[i].first.second);
   11710        79963 :           dump_dec (MSG_NOTE, vperm[i].second);
   11711        79963 :           dump_printf (MSG_NOTE, "]");
   11712              :         }
   11713         8827 :       dump_printf (MSG_NOTE, "\n");
   11714              :     }
   11715              : 
   11716              :   /* We can only handle two-vector permutes, everything else should
   11717              :      be lowered on the SLP level.  The following is closely inspired
   11718              :      by vect_transform_slp_perm_load and is supposed to eventually
   11719              :      replace it.
   11720              :      ???   As intermediate step do code-gen in the SLP tree representation
   11721              :      somehow?  */
   11722       435500 :   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
   11723       435500 :   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
   11724       435500 :   unsigned int index = 0;
   11725       435500 :   poly_uint64 mask_element;
   11726       435500 :   vec_perm_builder mask;
   11727       435500 :   mask.new_vector (nunits, npatterns, nelts_per_pattern);
   11728       435500 :   unsigned int count = mask.encoded_nelts ();
   11729       435500 :   mask.quick_grow (count);
   11730       435500 :   vec_perm_indices indices;
   11731       435500 :   unsigned nperms = 0;
   11732              :   /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
   11733              :      vectors to check during analysis, but we need to generate NOUTPUTS
   11734              :      vectors during transformation.  */
   11735       435500 :   unsigned total_nelts = olanes;
   11736       435500 :   unsigned process_nelts = olanes;
   11737       435500 :   if (repeating_p)
   11738              :     {
   11739       109633 :       total_nelts = (total_nelts / unpack_factor) * noutputs;
   11740       109633 :       if (gsi)
   11741         9879 :         process_nelts = total_nelts;
   11742              :     }
   11743       435500 :   unsigned last_ei = (total_nelts - 1) % process_nelts;
   11744      2091053 :   for (unsigned i = 0; i < process_nelts; ++i)
   11745              :     {
   11746              :       /* VI is the input vector index when generating code for REPEATING_P.  */
   11747      1663235 :       unsigned vi = i / olanes * (pack_p ? 2 : 1);
   11748      1663235 :       unsigned ei = i % olanes;
   11749      1663235 :       mask_element = vperm[ei].second;
   11750      1663235 :       if (pack_p)
   11751              :         {
   11752              :           /* In this case, we have N outputs and the single child provides 2N
   11753              :              inputs.  Output X permutes inputs 2X and 2X+1.
   11754              : 
   11755              :              The mask indices are taken directly from the SLP permutation node.
   11756              :              Index X selects from the first vector if (X / NUNITS) % 2 == 0;
   11757              :              X selects from the second vector otherwise.  These conditions
   11758              :              are only known at compile time for constant-length vectors.  */
   11759              :           first_vec = std::make_pair (0, 0);
   11760              :           second_vec = std::make_pair (0, 1);
   11761              :         }
   11762      1499669 :       else if (first_vec.first == -1U
   11763      1499669 :                || first_vec == vperm[ei].first)
   11764      1304558 :         first_vec = vperm[ei].first;
   11765       195111 :       else if (second_vec.first == -1U
   11766       195111 :                || second_vec == vperm[ei].first)
   11767              :         {
   11768       194723 :           second_vec = vperm[ei].first;
   11769       194723 :           mask_element += nunits;
   11770              :         }
   11771              :       else
   11772              :         {
   11773          388 :           if (dump_p)
   11774            7 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11775              :                              "permutation requires at "
   11776              :                              "least three vectors\n");
   11777          388 :           gcc_assert (!gsi);
   11778              :           return -1;
   11779              :         }
   11780              : 
   11781      1662847 :       mask[index++] = mask_element;
   11782              : 
   11783      1662847 :       if (index == count)
   11784              :         {
   11785       719469 :           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
   11786              :                               TYPE_VECTOR_SUBPARTS (op_vectype));
   11787       572952 :           bool identity_p = (indices.series_p (0, 1, mask[0], 1)
   11788       887723 :                              && constant_multiple_p (mask[0], nunits));
   11789       572952 :           machine_mode vmode = TYPE_MODE (vectype);
   11790       572952 :           machine_mode op_vmode = TYPE_MODE (op_vectype);
   11791       572952 :           unsigned HOST_WIDE_INT c;
   11792       572952 :           if ((!identity_p
   11793       532880 :                && !can_vec_perm_const_p (vmode, op_vmode, indices))
   11794       572952 :               || (identity_p
   11795        40072 :                   && !known_le (nunits,
   11796              :                                 TYPE_VECTOR_SUBPARTS (op_vectype))
   11797         7302 :                   && (!constant_multiple_p (nunits,
   11798            8 :                                             TYPE_VECTOR_SUBPARTS (op_vectype),
   11799            8 :                                             &c) || c != 2)))
   11800              :             {
   11801         7294 :               if (dump_p)
   11802              :                 {
   11803          152 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
   11804              :                                    vect_location,
   11805              :                                    "unsupported vect permute { ");
   11806         1586 :                   for (i = 0; i < count; ++i)
   11807              :                     {
   11808         1434 :                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11809         1434 :                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11810              :                     }
   11811          152 :                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11812              :                 }
   11813         7294 :               gcc_assert (!gsi);
   11814         7682 :               return -1;
   11815              :             }
   11816              : 
   11817       565658 :           if (!identity_p)
   11818       525586 :             nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
   11819       565658 :           if (gsi)
   11820              :             {
   11821        31376 :               if (second_vec.first == -1U)
   11822         7001 :                 second_vec = first_vec;
   11823              : 
   11824        31376 :               slp_tree
   11825        31376 :                 first_node = children[first_vec.first],
   11826        31376 :                 second_node = children[second_vec.first];
   11827              : 
   11828        31376 :               tree mask_vec = NULL_TREE;
   11829        31376 :               if (!identity_p)
   11830        28100 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11831              : 
   11832        31376 :               tree first_def
   11833        31376 :                 = vect_get_slp_vect_def (first_node, first_vec.second + vi);
   11834        31376 :               tree second_def
   11835        31376 :                 = vect_get_slp_vect_def (second_node, second_vec.second + vi);
   11836        31376 :               vect_add_slp_permutation (vinfo, gsi, node, first_def,
   11837        31376 :                                         second_def, mask_vec, mask[0]);
   11838              :             }
   11839              : 
   11840              :           index = 0;
   11841              :           first_vec = std::make_pair (-1U, -1U);
   11842              :           second_vec = std::make_pair (-1U, -1U);
   11843              :         }
   11844              :     }
   11845              : 
   11846       427818 :   return nperms;
   11847       435500 : }
   11848              : 
   11849              : /* Vectorize the SLP permutations in NODE as specified
   11850              :    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   11851              :    child number and lane number.
   11852              :    Interleaving of two two-lane two-child SLP subtrees (not supported):
   11853              :      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   11854              :    A blend of two four-lane two-child SLP subtrees:
   11855              :      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   11856              :    Highpart of a four-lane one-child SLP subtree (not supported):
   11857              :      [ { 0, 2 }, { 0, 3 } ]
   11858              :    Currently only a subset of these is supported by the code generation
                      :    done in vectorizable_slp_permutation_1.
                      : 
                      :    Returns false when vectorizable_slp_permutation_1 reports failure by
                      :    returning a negative permute count and true otherwise.  When GSI is
                      :    NULL and at least one permute is needed, the permutes are recorded
                      :    in COST_VEC as vec_perm operations of kind vect_body.  */
   11859              : 
   11860              : bool
   11861       115741 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11862              :                               slp_tree node, stmt_vector_for_cost *cost_vec)
   11863              : {
   11864       115741 :   tree vectype = SLP_TREE_VECTYPE (node);
   11865       115741 :   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
   11866       115741 :   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
   11867       115741 :                                                SLP_TREE_CHILDREN (node),
   11868              :                                                dump_enabled_p ());
   11869       115741 :   if (nperms < 0)
   11870              :     return false;
   11871              : 
   11872       114412 :   if (!gsi && nperms != 0)
   11873        92917 :     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
   11874              : 
   11875              :   return true;
   11876              : }
   11877              : 
   11878              : /* Vectorize SLP NODE in VINFO; INSTANCE is only passed on to
                      :    vect_transform_stmt.
                      : 
                      :    Constant and external nodes only get their vectors created by
                      :    vect_create_constant_vectors, unless they have no vector type or
                      :    already have vector defs.  For every other node the insertion
                      :    iterator is chosen first: the first scalar stmt for loads and the
                      :    last scalar stmt for stores, no iterator for cycle PHI, induction
                      :    and PHI nodes, and otherwise a point derived from the last of the
                      :    children's defs with the adjustments described in the body.  The
                      :    node is then code generated by vect_transform_stmt.  */
   11879              : 
   11880              : static void
   11881      1461423 : vect_schedule_slp_node (vec_info *vinfo,
   11882              :                         slp_tree node, slp_instance instance)
   11883              : {
   11884      1461423 :   gimple_stmt_iterator si;
   11885      1461423 :   int i;
   11886      1461423 :   slp_tree child;
   11887              : 
   11888              :   /* Vectorize externals and constants.  */
   11889      1461423 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
   11890      1461423 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
   11891              :     {
   11892              :       /* ???  vectorizable_shift can end up using a scalar operand which is
   11893              :          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
   11894              :          node in this case.  */
   11895       497004 :       if (!SLP_TREE_VECTYPE (node))
   11896       497004 :         return;
   11897              : 
   11898              :       /* There are two reasons vector defs might already exist.  The first
   11899              :          is that we are vectorizing an existing vector def.  The second is
   11900              :          when performing BB vectorization shared constant/external nodes
   11901              :          are not split apart during partitioning so during the code-gen
   11902              :          DFS walk we can end up visiting them twice.  */
   11903       490801 :       if (! SLP_TREE_VEC_DEFS (node).exists ())
   11904       490000 :         vect_create_constant_vectors (vinfo, node);
   11905       490801 :       return;
   11906              :     }
   11907              : 
   11908       964419 :   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   11909              : 
   11910       964419 :   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
   11911       964419 :   if (SLP_TREE_VECTYPE (node))
   11912       964413 :     SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
   11913              : 
   11914       964419 :   if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
   11915              :     {
   11916              :       /* Vectorized loads go before the first scalar load to make it
   11917              :          ready early, vectorized stores go before the last scalar
   11918              :          stmt which is where all uses are ready.  Despite its name
                      :          LAST_STMT_INFO therefore holds the first scalar stmt for
                      :          loads.  */
   11919       704721 :       stmt_vec_info last_stmt_info = NULL;
   11920       704721 :       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
   11921       163322 :         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
   11922              :       else /* DR_IS_WRITE */
   11923       541399 :         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
   11924       704721 :       si = gsi_for_stmt (last_stmt_info->stmt);
   11925       704721 :     }
   11926       259698 :   else if (!SLP_TREE_PERMUTE_P (node)
   11927       243224 :            && (SLP_TREE_TYPE (node) == cycle_phi_info_type
   11928              :                || SLP_TREE_TYPE (node) == induc_vec_info_type
   11929              :                || SLP_TREE_TYPE (node) == phi_info_type))
   11930              :     {
   11931              :       /* For PHI node vectorization we do not use the insertion iterator.  */
   11932        53968 :       si = gsi_none ();
   11933              :     }
   11934              :   else
   11935              :     {
   11936              :       /* Emit other stmts after the children vectorized defs which is
   11937              :          earliest possible.  */
   11938              :       gimple *last_stmt = NULL;
   11939              :       bool seen_vector_def = false;
   11940       573136 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   11941       367406 :         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
   11942              :           {
   11943              :             /* For fold-left reductions we are retaining the scalar
   11944              :                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
   11945              :                set so the representation isn't perfect.  Resort to the
   11946              :                last scalar def here.  */
   11947       294845 :             if (SLP_TREE_VEC_DEFS (child).is_empty ())
   11948              :               {
   11949          866 :                 gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
   11950          866 :                 gphi *phi = as_a <gphi *>
   11951          866 :                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
   11952          866 :                 if (!last_stmt)
   11953              :                   last_stmt = phi;
   11954          648 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
   11955              :                   last_stmt = phi;
   11956          637 :                 else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
   11957              :                   ;
   11958              :                 else
   11959            0 :                   gcc_unreachable ();
   11960              :               }
   11961              :             /* We are emitting all vectorized stmts in the same place and
   11962              :                the last one is the last.
   11963              :                ???  Unless we have a load permutation applied and that
   11964              :                figures to re-use an earlier generated load.  */
   11965              :             unsigned j;
   11966              :             tree vdef;
   11967       696829 :             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11968              :               {
   11969       401984 :                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11970       401984 :                 if (!last_stmt)
   11971              :                   last_stmt = vstmt;
   11972       206850 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11973              :                   last_stmt = vstmt;
   11974        45417 :                 else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11975              :                   ;
   11976              :                 else
   11977            0 :                   gcc_unreachable ();
   11978              :               }
   11979              :           }
   11980        72561 :         else if (!SLP_TREE_VECTYPE (child))
   11981              :           {
   11982              :             /* For externals without a vector type, i.e. those we use
                      :                unvectorized, look at all scalar defs.  */
   11983              :             unsigned j;
   11984              :             tree def;
   11985        12903 :             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
   11986         7334 :               if (TREE_CODE (def) == SSA_NAME
   11987         7334 :                   && !SSA_NAME_IS_DEFAULT_DEF (def))
   11988              :                 {
   11989          167 :                   gimple *stmt = SSA_NAME_DEF_STMT (def);
   11990          167 :                   if (gimple_uid (stmt) == -1u)
   11991              :                     /* If the stmt is not inside the region do not
   11992              :                        use it as possible insertion point.  */
   11993              :                     ;
   11994          159 :                   else if (!last_stmt)
   11995              :                     last_stmt = stmt;
   11996          153 :                   else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
   11997              :                     last_stmt = stmt;
   11998          153 :                   else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
   11999              :                     ;
   12000              :                   else
   12001            0 :                     gcc_unreachable ();
   12002              :                 }
   12003              :           }
   12004              :         else
   12005              :           {
   12006              :             /* For externals we have to look at all defs since their
   12007              :                insertion place is decided per vector.  But beware
   12008              :                of pre-existing vectors where we need to make sure
   12009              :                we do not insert before the region boundary.  */
   12010        66992 :             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
   12011          654 :                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
   12012              :               seen_vector_def = true;
   12013              :             else
   12014              :               {
   12015              :                 unsigned j;
   12016              :                 tree vdef;
   12017       529058 :                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   12018        94777 :                   if (TREE_CODE (vdef) == SSA_NAME
   12019        94777 :                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
   12020              :                     {
   12021        19610 :                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   12022        19610 :                       if (!last_stmt)
   12023              :                         last_stmt = vstmt;
   12024        10962 :                       else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   12025              :                         last_stmt = vstmt;
   12026         8709 :                       else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   12027              :                         ;
   12028              :                       else
   12029            0 :                         gcc_unreachable ();
   12030              :                     }
   12031              :               }
   12032              :           }
   12033              :       /* This can happen when all children are pre-existing vectors or
   12034              :          constants.  Fall back to the first scalar stmt of NODE.  */
   12035       205730 :       if (!last_stmt)
   12036         1724 :         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
   12037         1724 :       if (!last_stmt)
   12038              :         {
   12039            0 :           gcc_assert (seen_vector_def);
   12040            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   12041              :         }
   12042       205730 :       else if (is_ctrl_altering_stmt (last_stmt))
   12043              :         {
   12044              :           /* We split regions to vectorize at control altering stmts
   12045              :              with a definition so this must be an external which
   12046              :              we can insert at the start of the region.  */
   12047            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   12048              :         }
   12049       205730 :       else if (is_a <bb_vec_info> (vinfo)
   12050        17754 :                && !SLP_TREE_PERMUTE_P (node)
   12051        16331 :                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
   12052       206990 :                && gimple_could_trap_p (stmt_info->stmt))
   12053              :         {
   12054              :           /* We've constrained possibly trapping operations to all come
   12055              :              from the same basic-block, if vectorized defs would allow earlier
   12056              :              scheduling still force vectorized stmts to the original block.
   12057              :              This is only necessary for BB vectorization since for loop vect
   12058              :              all operations are in a single BB and scalar stmt based
   12059              :              placement doesn't play well with epilogue vectorization.  */
   12060           53 :           gcc_assert (dominated_by_p (CDI_DOMINATORS,
   12061              :                                       gimple_bb (stmt_info->stmt),
   12062              :                                       gimple_bb (last_stmt)));
   12063           53 :           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
   12064              :         }
   12065       205677 :       else if (is_a <gphi *> (last_stmt))
   12066        14345 :         si = gsi_after_labels (gimple_bb (last_stmt));
   12067              :       else
   12068              :         {
   12069       191332 :           si = gsi_for_stmt (last_stmt);
   12070       191332 :           gsi_next (&si);
   12071              : 
   12072              :           /* Avoid scheduling internal defs outside of the loop when
   12073              :              we might have only implicitly tracked loop mask/len defs.  */
   12074       191332 :           if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
   12075           74 :             if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
   12076       173843 :                 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
   12077              :               {
   12078           74 :                 gimple_stmt_iterator si2
   12079           74 :                   = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
   12080           74 :                 if ((gsi_end_p (si2)
   12081            0 :                      && (LOOP_VINFO_LOOP (loop_vinfo)->header
   12082            0 :                          != gimple_bb (last_stmt))
   12083            0 :                      && dominated_by_p (CDI_DOMINATORS,
   12084              :                                         LOOP_VINFO_LOOP (loop_vinfo)->header,
   12085            0 :                                         gimple_bb (last_stmt)))
   12086           74 :                     || (!gsi_end_p (si2)
   12087           74 :                         && last_stmt != *si2
   12088           72 :                         && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
   12089            3 :                   si = si2;
   12090              :               }
   12091              :         }
   12092              :     }
   12093              : 
   12094       964419 :   if (dump_enabled_p ())
   12095              :     {
   12096        71845 :       if (stmt_info)
   12097        71792 :         dump_printf_loc (MSG_NOTE, vect_location,
   12098              :                          "------>vectorizing SLP node starting from: %G",
   12099              :                          stmt_info->stmt);
   12100              :       else
   12101              :         {
   12102           53 :           dump_printf_loc (MSG_NOTE, vect_location,
   12103              :                            "------>vectorizing SLP node:\n");
   12104           53 :           vect_print_slp_tree (MSG_NOTE, vect_location, node);
   12105              :         }
   12106              :     }
   12107       964419 :   vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
   12108              : }
   12109              : 
   12110              : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   12111              :    For loop vectorization this is done in vectorizable_call, but for SLP
   12112              :    it needs to be deferred until end of vect_schedule_slp, because multiple
   12113              :    SLP instances may refer to the same scalar stmt.
                      : 
                      :    Calls without a lhs are replaced by a nop instead.  Scalar stmts
                      :    that are not calls and calls whose gimple_bb is NULL are skipped.
                      :    The walk ignores NULL nodes and nodes that are not internal defs,
                      :    processes the children before NODE itself and uses VISITED to
                      :    handle each node only once.  The two-argument overload following
                      :    this function is the entry point that supplies an empty VISITED
                      :    set.  */
   12114              : 
   12115              : static void
   12116       600677 : vect_remove_slp_scalar_calls (vec_info *vinfo,
   12117              :                               slp_tree node, hash_set<slp_tree> &visited)
   12118              : {
   12119       600677 :   gimple *new_stmt;
   12120       600677 :   gimple_stmt_iterator gsi;
   12121       600677 :   int i;
   12122       600677 :   slp_tree child;
   12123       600677 :   tree lhs;
   12124       600677 :   stmt_vec_info stmt_info;
   12125              : 
   12126       600677 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12127       188413 :     return;
   12128              : 
   12129       456257 :   if (visited.add (node))
   12130              :     return;
   12131              : 
   12132       923159 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12133       510895 :     vect_remove_slp_scalar_calls (vinfo, child, visited);
   12134              : 
   12135      1305688 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
   12136              :     {
   12137       485375 :       if (!stmt_info)
   12138         3974 :         continue;
   12139       481401 :       stmt_info = vect_orig_stmt (stmt_info);
   12140       481401 :       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
   12141         5231 :       if (!stmt || gimple_bb (stmt) == NULL)
   12142       476208 :         continue;
   12143         5193 :       lhs = gimple_call_lhs (stmt);
   12144         5193 :       if (lhs)
   12145         4579 :         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
   12146              :       else
   12147          614 :         new_stmt = gimple_build_nop ();
   12148         5193 :       unlink_stmt_vdef (stmt_info->stmt);
   12149         5193 :       gsi = gsi_for_stmt (stmt);
   12150         5193 :       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
   12151         5193 :       if (lhs)
   12152         4579 :         SSA_NAME_DEF_STMT (lhs) = new_stmt;
   12153              :     }
   12154              : }
   12155              : 
   12156              : static void
   12157        89782 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
   12158              : {
   12159        89782 :   hash_set<slp_tree> visited;
   12160        89782 :   vect_remove_slp_scalar_calls (vinfo, node, visited);
   12161        89782 : }
   12162              : 
   12163              : /* Vectorize the instance root.  */
   12164              : 
   12165              : void
   12166        10820 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
   12167              : {
   12168        10820 :   gassign *rstmt = NULL;
   12169              : 
   12170        10820 :   if (instance->kind == slp_inst_kind_ctor)
   12171              :     {
   12172         4901 :       if (SLP_TREE_VEC_DEFS (node).length () == 1)
   12173              :         {
   12174         4864 :           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
   12175         4864 :           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12176         4864 :           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
   12177         4864 :                                           TREE_TYPE (vect_lhs)))
   12178            0 :             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
   12179              :                                vect_lhs);
   12180         4864 :           rstmt = gimple_build_assign (root_lhs, vect_lhs);
   12181              :         }
   12182              :       else
   12183              :         {
   12184           37 :           gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
   12185           37 :           tree child_def;
   12186           37 :           int j;
   12187           37 :           vec<constructor_elt, va_gc> *v;
   12188           37 :           vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
   12189              : 
   12190              :           /* A CTOR can handle V16HI composition from VNx8HI so we
   12191              :              do not need to convert vector elements if the types
   12192              :              do not match.  */
   12193          111 :           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
   12194           74 :             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
   12195           37 :           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12196           37 :           tree rtype
   12197           37 :             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
   12198           37 :           tree r_constructor = build_constructor (rtype, v);
   12199           37 :           rstmt = gimple_build_assign (lhs, r_constructor);
   12200              :         }
   12201              :     }
   12202         5919 :   else if (instance->kind == slp_inst_kind_bb_reduc)
   12203              :     {
   12204              :       /* Largely inspired by reduction chain epilogue handling in
   12205              :          vect_create_epilog_for_reduction.  */
   12206         4330 :       vec<tree> vec_defs = vNULL;
   12207         4330 :       vect_get_slp_defs (node, &vec_defs);
   12208         4330 :       enum tree_code reduc_code
   12209         4330 :         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
   12210              :       /* ???  We actually have to reflect signs somewhere.  */
   12211         4330 :       if (reduc_code == MINUS_EXPR)
   12212            0 :         reduc_code = PLUS_EXPR;
   12213         4330 :       gimple_seq epilogue = NULL;
   12214              :       /* We may end up with more than one vector result, reduce them
   12215              :          to one vector.  */
   12216         4330 :       tree vec_def = vec_defs[0];
   12217         4330 :       tree vectype = TREE_TYPE (vec_def);
   12218         4330 :       tree compute_vectype = vectype;
   12219         4330 :       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
   12220         4130 :                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
   12221         7296 :                                  && operation_can_overflow (reduc_code));
   12222         2833 :       if (pun_for_overflow_p)
   12223              :         {
   12224         2833 :           compute_vectype = unsigned_type_for (vectype);
   12225         2833 :           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12226              :                                   compute_vectype, vec_def);
   12227              :         }
   12228         6708 :       for (unsigned i = 1; i < vec_defs.length (); ++i)
   12229              :         {
   12230         2378 :           tree def = vec_defs[i];
   12231         2378 :           if (pun_for_overflow_p)
   12232         2273 :             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12233              :                                 compute_vectype, def);
   12234         2378 :           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
   12235              :                                   vec_def, def);
   12236              :         }
   12237         4330 :       vec_defs.release ();
   12238              :       /* ???  Support other schemes than direct internal fn.  */
   12239         4330 :       internal_fn reduc_fn;
   12240         4330 :       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
   12241         4330 :           || reduc_fn == IFN_LAST)
   12242            0 :         gcc_unreachable ();
   12243         4330 :       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
   12244         4330 :                                       TREE_TYPE (compute_vectype), vec_def);
   12245         4330 :       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
   12246              :         {
   12247         2813 :           tree rem_def = NULL_TREE;
   12248        12403 :           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
   12249              :             {
   12250         9590 :               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
   12251         9590 :               if (!rem_def)
   12252              :                 rem_def = def;
   12253              :               else
   12254         6777 :                 rem_def = gimple_build (&epilogue, reduc_code,
   12255         6777 :                                         TREE_TYPE (scalar_def),
   12256              :                                         rem_def, def);
   12257              :             }
   12258         2813 :           scalar_def = gimple_build (&epilogue, reduc_code,
   12259         2813 :                                      TREE_TYPE (scalar_def),
   12260              :                                      scalar_def, rem_def);
   12261              :         }
   12262         4330 :       scalar_def = gimple_convert (&epilogue,
   12263         4330 :                                    TREE_TYPE (vectype), scalar_def);
   12264         4330 :       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12265         4330 :       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
   12266         4330 :       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
   12267         4330 :       update_stmt (gsi_stmt (rgsi));
   12268         4330 :       return;
   12269              :     }
   12270         1589 :   else if (instance->kind == slp_inst_kind_gcond)
   12271              :     {
   12272              :       /* Only support a single root for now as we can't codegen CFG yet and so we
   12273              :          can't support lane > 1 at this time.  */
   12274         1589 :       gcc_assert (instance->root_stmts.length () == 1);
   12275         1589 :       auto root_stmt_info = instance->root_stmts[0];
   12276         1589 :       auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
   12277         1589 :       gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
   12278         1589 :       gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
   12279         1589 :       bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
   12280              :                                           root_stmt_info, &rgsi, node, NULL);
   12281         1589 :       gcc_assert (res);
   12282         1589 :       return;
   12283              :     }
   12284              :   else
   12285            0 :     gcc_unreachable ();
   12286              : 
   12287         4901 :   gcc_assert (rstmt);
   12288              : 
   12289         4901 :   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12290         4901 :   gsi_replace (&rgsi, rstmt, true);
   12291              : }
   12292              : 
   12293              : struct slp_scc_info  /* Per-node state of the SCC walk done by vect_schedule_scc.  */
   12294              : {
   12295              :   bool on_stack;  /* Set while the node is pushed on the DFS stack and not yet scheduled.  */
   12296              :   int dfs;  /* DFS number, taken from the running MAXDFS counter on first visit.  */
   12297              :   int lowlink;  /* Smallest DFS number seen so far (own, lowlink of newly visited children, DFS of on-stack children); equals DFS at an SCC root.  */
   12298              : };
   12299              : 
   12300              : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.
                      :    NODE is the node this invocation visits; it must not have an entry
                      :    in SCC_INFO yet.  MAXDFS is the running DFS number counter and STACK
                      :    holds the internal-def nodes visited but not yet scheduled.  Leaves
                      :    and single-node SCCs are scheduled right away; the members of a
                      :    larger SCC are scheduled once its root is reached, with PHIs
                      :    breaking the cycle.  Finally the still missing arguments of the
                      :    vectorized PHIs scheduled by this invocation are filled in.  */
   12301              : 
   12302              : static void
   12303      1461423 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
   12304              :                    hash_map<slp_tree, slp_scc_info> &scc_info,
   12305              :                    int &maxdfs, vec<slp_tree> &stack)
   12306              : {
   12307      1461423 :   bool existed_p;
   12308      1461423 :   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
   12309      1461423 :   gcc_assert (!existed_p);  /* Callers only get here for nodes without an entry.  */
   12310      1461423 :   info->dfs = maxdfs;
   12311      1461423 :   info->lowlink = maxdfs;
   12312      1461423 :   maxdfs++;
   12313              : 
   12314              :   /* Leaf.  Anything that is not an internal def is scheduled immediately and never enters STACK.  */
   12315      1461423 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12316              :     {
   12317       497004 :       info->on_stack = false;
   12318       497004 :       vect_schedule_slp_node (vinfo, node, instance);
   12319      1025668 :       return;
   12320              :     }
   12321              : 
   12322       964419 :   info->on_stack = true;
   12323       964419 :   stack.safe_push (node);
   12324              : 
   12325       964419 :   unsigned i;
   12326       964419 :   slp_tree child;
   12327              :   /* DFS recurse.  Missing (NULL) children are skipped.  */
   12328      1992970 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12329              :     {
   12330      1028551 :       if (!child)
   12331        55101 :         continue;
   12332       973450 :       slp_scc_info *child_info = scc_info.get (child);
   12333       973450 :       if (!child_info)
   12334              :         {
   12335       883902 :           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
   12336              :           /* The recursion inserted into SCC_INFO, which might have re-allocated its entries, so look up both again.  */
   12337       883902 :           info = scc_info.get (node);
   12338       883902 :           child_info = scc_info.get (child);
   12339       883902 :           info->lowlink = MIN (info->lowlink, child_info->lowlink);
   12340              :         }
   12341        89548 :       else if (child_info->on_stack)  /* Visited before and not scheduled yet.  */
   12342        25492 :         info->lowlink = MIN (info->lowlink, child_info->dfs);
   12343              :     }
   12344       964419 :   if (info->lowlink != info->dfs)
   12345              :     return;  /* Not an SCC root; NODE stays on STACK until the root of its SCC schedules it.  */
   12346              : 
   12347       932759 :   auto_vec<slp_tree, 4> phis_to_fixup;
   12348              : 
   12349              :   /* Singleton.  NODE is on top of STACK so it forms an SCC on its own.  */
   12350       932759 :   if (stack.last () == node)
   12351              :     {
   12352       908922 :       stack.pop ();
   12353       908922 :       info->on_stack = false;
   12354       908922 :       vect_schedule_slp_node (vinfo, node, instance);
   12355       908922 :       if (!SLP_TREE_PERMUTE_P (node)
   12356       908922 :           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
   12357        30239 :         phis_to_fixup.quick_push (node);
   12358              :     }
   12359              :   else
   12360              :     {
   12361              :       /* SCC.  Its members are the STACK entries from the slot of NODE up to the top.  */
   12362        23837 :       int last_idx = stack.length () - 1;
   12363        55497 :       while (stack[last_idx] != node)
   12364        31660 :         last_idx--;
   12365              :       /* We can break the cycle at PHIs who have at least one child
   12366              :          code generated.  Then we could re-start the DFS walk until
   12367              :          all nodes in the SCC are covered (we might have new entries
   12368              :          for only back-reachable nodes).  But it's simpler to just
   12369              :          iterate and schedule those that are ready.  */
   12370        23837 :       unsigned todo = stack.length () - last_idx;  /* Members left to schedule.  */
   12371        24164 :       do
   12372              :         {
   12373       105555 :           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
   12374              :             {
   12375        57227 :               slp_tree entry = stack[idx];
   12376        57227 :               if (!entry)  /* Scheduled by an earlier sweep, the slot was cleared below.  */
   12377          934 :                 continue;
   12378        56293 :               bool phi = (!SLP_TREE_PERMUTE_P (entry)
   12379        56293 :                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
   12380        56293 :               bool ready = !phi;  /* A non-PHI is ready unless one of its children is
                      :                                      still on the stack.  A PHI is ready as soon as
                      :                                      one child is missing or no longer on the stack.
                      :                                      Only PHIs are expected to have missing children.  */
   12381       142467 :               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
   12382       111213 :                   if (!child)
   12383              :                     {
   12384        22983 :                       gcc_assert (phi);
   12385              :                       ready = true;
   12386              :                       break;
   12387              :                     }
   12388        88230 :                   else if (scc_info.get (child)->on_stack)
   12389              :                     {
   12390        24027 :                       if (!phi)
   12391              :                         {
   12392              :                           ready = false;
   12393              :                           break;
   12394              :                         }
   12395              :                     }
   12396              :                   else
   12397              :                     {
   12398        64203 :                       if (phi)
   12399              :                         {
   12400              :                           ready = true;
   12401              :                           break;
   12402              :                         }
   12403              :                     }
   12404        33310 :               if (ready)
   12405              :                 {
   12406        55497 :                   vect_schedule_slp_node (vinfo, entry, instance);
   12407        55497 :                   scc_info.get (entry)->on_stack = false;
   12408        55497 :                   stack[idx] = NULL;  /* Mark the slot done for later sweeps.  */
   12409        55497 :                   todo--;
   12410        55497 :                   if (phi)
   12411        24273 :                     phis_to_fixup.safe_push (entry);
   12412              :                 }
   12413              :             }
   12414              :         }
   12415        24164 :       while (todo != 0);  /* Sweep again until every member was scheduled.  */
   12416              : 
   12417              :       /* Pop the SCC.  */
   12418        23837 :       stack.truncate (last_idx);
   12419              :     }
   12420              : 
   12421              :   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
   12422              :   slp_tree phi_node;
   12423      1920030 :   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
   12424              :     {
   12425        54512 :       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
   12426        54512 :       edge_iterator ei;
   12427        54512 :       edge e;
   12428       171804 :       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
   12429              :         {
   12430       117292 :           unsigned dest_idx = e->dest_idx;
   12431       117292 :           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
   12432       117292 :           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
   12433        66013 :             continue;  /* Only internal-def children are filled in here.  */
   12434        51279 :           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
   12435              :           /* Simply fill all args: the PHI defining vector def I of PHI_NODE gets vector def I of CHILD on edge E.  */
   12436        51279 :           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
   12437              :               != vect_first_order_recurrence)
   12438       110252 :             for (unsigned i = 0; i < n; ++i)
   12439              :               {
   12440        59013 :                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
   12441        59013 :                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));  /* NOTE(review): this PHI shadows the
                      :                    representative PHI declared above, so the location passed below is read
                      :                    from this PHI while the first order recurrence branch reads it from the
                      :                    representative PHI - confirm this is intended.  */
   12442        59013 :                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
   12443              :                              e, gimple_phi_arg_location (phi, dest_idx));
   12444              :               }
   12445              :           else
   12446              :             {
   12447              :               /* Unless it is a first order recurrence which needs
   12448              :                  args filled in for both the PHI node and the permutes.  */
   12449           40 :               gimple *perm
   12450           40 :                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
   12451           40 :               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));  /* The PHI defining rhs1 of the first permute; it receives the last vector def of CHILD.  */
   12452           40 :               add_phi_arg (as_a <gphi *> (rphi),
   12453              :                            vect_get_slp_vect_def (child, n - 1),
   12454              :                            e, gimple_phi_arg_location (phi, dest_idx));
   12455          117 :               for (unsigned i = 0; i < n; ++i)  /* Permute I gets vector def I-1 of CHILD as rhs1 (for I > 0) and vector def I as rhs2.  */
   12456              :                 {
   12457           77 :                   gimple *perm
   12458           77 :                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
   12459           77 :                   if (i > 0)
   12460           37 :                     gimple_assign_set_rhs1 (perm,
   12461              :                                             vect_get_slp_vect_def (child, i - 1));
   12462           77 :                   gimple_assign_set_rhs2 (perm,
   12463              :                                           vect_get_slp_vect_def (child, i));
   12464           77 :                   update_stmt (perm);
   12465              :                 }
   12466              :             }
   12467              :         }
   12468              :     }
   12469       932759 : }
   12470              : 
   12471              : /* Generate vector code for SLP_INSTANCES in the loop/basic block.
                      :    A first walk schedules the tree of each instance with
                      :    vect_schedule_scc, sharing one SCC info map so that nodes already
                      :    visited for an earlier instance are not scheduled again, and then
                      :    vectorizes the instance root stmts if there are any.  A second walk
                      :    removes scalar calls (loop vectorization only) and the original
                      :    scalar stmts of vectorized stores.  */
   12472              : 
   12473              : void
   12474       538891 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
   12475              : {
   12476       538891 :   slp_instance instance;
   12477       538891 :   unsigned int i;
   12478              : 
   12479       538891 :   hash_map<slp_tree, slp_scc_info> scc_info;  /* Shared by all instances.  */
   12480       538891 :   int maxdfs = 0;
   12481      1116517 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12482              :     {
   12483       577626 :       slp_tree node = SLP_INSTANCE_TREE (instance);
   12484       577626 :       if (dump_enabled_p ())
   12485              :         {
   12486        16008 :           dump_printf_loc (MSG_NOTE, vect_location,
   12487              :                            "Vectorizing SLP tree:\n");
   12488              :           /* ???  Dump all?  Only the first root stmt is printed.  */
   12489        16008 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12490          469 :             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
   12491          469 :                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
   12492        16008 :           vect_print_slp_graph (MSG_NOTE, vect_location,
   12493              :                                 SLP_INSTANCE_TREE (instance));
   12494              :         }
   12495              :       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
   12496              :          have a PHI be the node breaking the cycle.  */
   12497       577626 :       auto_vec<slp_tree> stack;  /* Fresh DFS stack for each instance.  */
   12498       577626 :       if (!scc_info.get (node))  /* The root may already have been visited for an earlier instance.  */
   12499       577521 :         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
   12500              : 
   12501       577626 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12502        10820 :         vectorize_slp_instance_root_stmt (vinfo, node, instance);
   12503              : 
   12504       577626 :       if (dump_enabled_p ())
   12505        16008 :         dump_printf_loc (MSG_NOTE, vect_location,
   12506              :                          "vectorizing stmts using SLP.\n");
   12507       577626 :     }
   12508              : 
   12509      1655408 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12510              :     {
   12511       577626 :       slp_tree root = SLP_INSTANCE_TREE (instance);
   12512       577626 :       stmt_vec_info store_info;
   12513       577626 :       unsigned int j;
   12514              : 
   12515              :       /* Remove scalar call stmts.  Do not do this for basic-block
   12516              :          vectorization as not all uses may be vectorized.
   12517              :          ???  Why should this be necessary?  DCE should be able to
   12518              :          remove the stmts itself.
   12519              :          ???  For BB vectorization we can as well remove scalar
   12520              :          stmts starting from the SLP tree root if they have no
   12521              :          uses.  */
   12522       577626 :       if (is_a <loop_vec_info> (vinfo))
   12523        89782 :         vect_remove_slp_scalar_calls (vinfo, root);
   12524              : 
   12525              :       /* Remove the original scalar stmts of vectorized stores.  The walk stops at the first lane of ROOT that is missing, has no data reference or is not a write.  */
   12526      2575575 :       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
   12527              :         {
   12528      1456550 :           if (!store_info
   12529      1456536 :               || !STMT_VINFO_DATA_REF (store_info)
   12530      1428964 :               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
   12531              :             break;
   12532              : 
   12533      1420323 :           store_info = vect_orig_stmt (store_info);
   12534              :           /* Free the attached stmt_vec_info and remove the stmt.  */
   12535      1420323 :           vinfo->remove_stmt (store_info);
   12536              : 
   12537              :           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
   12538              :              to not crash in vect_free_slp_tree later.  */
   12539      1420323 :           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
   12540       541100 :             SLP_TREE_REPRESENTATIVE (root) = NULL;
   12541              :         }
   12542              :     }
   12543       538891 : }
        

Generated by: LCOV version 2.4-beta

The LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.