LCOV - code coverage report
Current view: top level - gcc - tree-vect-loop.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 89.2 % 5139 4584
Test Date: 2026-06-20 15:32:29 Functions: 94.9 % 98 93
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* Loop Vectorization
       2              :    Copyright (C) 2003-2026 Free Software Foundation, Inc.
       3              :    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
       4              :    Ira Rosen <irar@il.ibm.com>
       5              : 
       6              : This file is part of GCC.
       7              : 
       8              : GCC is free software; you can redistribute it and/or modify it under
       9              : the terms of the GNU General Public License as published by the Free
      10              : Software Foundation; either version 3, or (at your option) any later
      11              : version.
      12              : 
      13              : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      14              : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      15              : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      16              : for more details.
      17              : 
      18              : You should have received a copy of the GNU General Public License
      19              : along with GCC; see the file COPYING3.  If not see
      20              : <http://www.gnu.org/licenses/>.  */
      21              : 
      22              : #define INCLUDE_ALGORITHM
      23              : #include "config.h"
      24              : #include "system.h"
      25              : #include "coretypes.h"
      26              : #include "backend.h"
      27              : #include "target.h"
      28              : #include "rtl.h"
      29              : #include "tree.h"
      30              : #include "gimple.h"
      31              : #include "cfghooks.h"
      32              : #include "tree-pass.h"
      33              : #include "ssa.h"
      34              : #include "optabs-tree.h"
      35              : #include "memmodel.h"
      36              : #include "optabs.h"
      37              : #include "diagnostic-core.h"
      38              : #include "fold-const.h"
      39              : #include "stor-layout.h"
      40              : #include "cfganal.h"
      41              : #include "gimplify.h"
      42              : #include "gimple-iterator.h"
      43              : #include "gimplify-me.h"
      44              : #include "tree-ssa-loop-ivopts.h"
      45              : #include "tree-ssa-loop-manip.h"
      46              : #include "tree-ssa-loop-niter.h"
      47              : #include "tree-ssa-loop.h"
      48              : #include "cfgloop.h"
      49              : #include "tree-scalar-evolution.h"
      50              : #include "tree-vectorizer.h"
      51              : #include "gimple-fold.h"
      52              : #include "cgraph.h"
      53              : #include "tree-cfg.h"
      54              : #include "tree-if-conv.h"
      55              : #include "internal-fn.h"
      56              : #include "tree-vector-builder.h"
      57              : #include "vec-perm-indices.h"
      58              : #include "tree-eh.h"
      59              : #include "case-cfn-macros.h"
      60              : #include "langhooks.h"
      61              : #include "opts.h"
      62              : #include "hierarchical_discriminator.h"
      63              : 
      64              : /* Loop Vectorization Pass.
      65              : 
      66              :    This pass tries to vectorize loops.
      67              : 
      68              :    For example, the vectorizer transforms the following simple loop:
      69              : 
      70              :         short a[N]; short b[N]; short c[N]; int i;
      71              : 
      72              :         for (i=0; i<N; i++){
      73              :           a[i] = b[i] + c[i];
      74              :         }
      75              : 
      76              :    as if it was manually vectorized by rewriting the source code into:
      77              : 
      78              :         typedef int __attribute__((mode(V8HI))) v8hi;
      79              :         short a[N];  short b[N]; short c[N];   int i;
      80              :         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
      81              :         v8hi va, vb, vc;
      82              : 
      83              :         for (i=0; i<N/8; i++){
      84              :           vb = pb[i];
      85              :           vc = pc[i];
      86              :           va = vb + vc;
      87              :           pa[i] = va;
      88              :         }
      89              : 
      90              :         The main entry to this pass is vectorize_loops(), in which
      91              :    the vectorizer applies a set of analyses on a given set of loops,
      92              :    followed by the actual vectorization transformation for the loops that
      93              :    had successfully passed the analysis phase.
      94              :         Throughout this pass we make a distinction between two types of
      95              :    data: scalars (which are represented by SSA_NAMES), and memory references
      96              :    ("data-refs").  These two types of data require different handling both
      97              :    during analysis and transformation. The types of data-refs that the
       98              :    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
      99              :    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
     100              :    accesses are required to have a simple (consecutive) access pattern.
     101              : 
     102              :    Analysis phase:
     103              :    ===============
     104              :         The driver for the analysis phase is vect_analyze_loop().
     105              :    It applies a set of analyses, some of which rely on the scalar evolution
     106              :    analyzer (scev) developed by Sebastian Pop.
     107              : 
     108              :         During the analysis phase the vectorizer records some information
     109              :    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
     110              :    loop, as well as general information about the loop as a whole, which is
     111              :    recorded in a "loop_vec_info" struct attached to each loop.
     112              : 
     113              :    Transformation phase:
     114              :    =====================
     115              :         The loop transformation phase scans all the stmts in the loop, and
     116              :    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
     117              :    the loop that needs to be vectorized.  It inserts the vector code sequence
     118              :    just before the scalar stmt S, and records a pointer to the vector code
     119              :    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
     120              :    attached to S).  This pointer will be used for the vectorization of following
     121              :    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
     122              :    otherwise, we rely on dead code elimination for removing it.
     123              : 
     124              :         For example, say stmt S1 was vectorized into stmt VS1:
     125              : 
     126              :    VS1: vb = px[i];
     127              :    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
     128              :    S2:  a = b;
     129              : 
     130              :    To vectorize stmt S2, the vectorizer first finds the stmt that defines
     131              :    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
     132              :    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
     133              :    resulting sequence would be:
     134              : 
     135              :    VS1: vb = px[i];
     136              :    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
     137              :    VS2: va = vb;
     138              :    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
     139              : 
      140              :         Operands that are not SSA_NAMEs are data-refs that appear in
     141              :    load/store operations (like 'x[i]' in S1), and are handled differently.
     142              : 
     143              :    Target modeling:
     144              :    =================
     145              :         Currently the only target specific information that is used is the
     146              :    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
     147              :    Targets that can support different sizes of vectors, for now will need
     148              :    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
     149              :    flexibility will be added in the future.
     150              : 
      151              :         Since we only vectorize operations whose vector form can be
     152              :    expressed using existing tree codes, to verify that an operation is
     153              :    supported, the vectorizer checks the relevant optab at the relevant
      154              :    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
     155              :    the value found is CODE_FOR_nothing, then there's no target support, and
     156              :    we can't vectorize the stmt.
     157              : 
     158              :    For additional information on this project see:
     159              :    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
     160              : */
     161              : 
     162              : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
     163              :                                                 unsigned *);
     164              : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
     165              :                                                gphi **);
     166              : 
     167              : 
     168              : /* Function vect_is_simple_iv_evolution.
     169              : 
     170              :    Return true if ACCESS_FN has a "simple" evolution in loop number LOOP_NB:
     171              :    the evolution part must exist, must not itself be a chrec, and must pass the step check at the end of the function.  The initial value and the step are stored in STMT_INFO before that final check, so they are set even when false is returned for an unknown step.  */
     172              : 
     173              : static bool
     174       909546 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
     175              :                              stmt_vec_info stmt_info)
     176              : {
     177       909546 :   tree init_expr;
     178       909546 :   tree step_expr;
     179       909546 :   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
     180       909546 :   basic_block bb;
     181              : 
     182              :   /* When there is no evolution in this loop, the evolution function
     183              :      is not "simple".  */
     184       909546 :   if (evolution_part == NULL_TREE)
     185              :     return false;
     186              : 
     187              :   /* When the evolution part is itself a chrec (a polynomial of degree >= 2)
     188              :      the evolution function is not "simple".  */
     189       973395 :   if (tree_is_chrec (evolution_part))
     190              :     return false;
     191              : 
     192       799270 :   step_expr = evolution_part;
     193       799270 :   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
     194              : 
     195       799270 :   if (dump_enabled_p ())
     196        39910 :     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
     197              :                      step_expr, init_expr);
     198              : 
     199       799270 :   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
     200       799270 :   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;  /* Both fields are written before the step is validated below.  */
     201              :   /* Accept an INTEGER_CST step, an SSA_NAME step that is not defined in a block inside the loop and whose type is integral (or scalar float with flag_associative_math), or a REAL_CST step with flag_associative_math; reject anything else.  */
     202       799270 :   if (TREE_CODE (step_expr) != INTEGER_CST
     203        71598 :       && (TREE_CODE (step_expr) != SSA_NAME
     204        56513 :           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
     205        56250 :               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
     206         7787 :           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
     207          131 :               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
     208          131 :                   || !flag_associative_math)))
     209       863176 :       && (TREE_CODE (step_expr) != REAL_CST
     210          431 :           || !flag_associative_math))
     211              :     {
     212        63849 :       if (dump_enabled_p ())
     213         3086 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     214              :                          "step unknown.\n");
     215        63849 :       return false;
     216              :     }
     217              : 
     218              :   return true;
     219              : }
     220              : 
     221              : /* Function vect_is_nonlinear_iv_evolution
     222              : 
     223              :    Return true if the two-argument PHI LOOP_PHI_NODE of LOOP has an integral
     224              :    preheader value and a latch value defined by one of:
     225              :    1. neg of the PHI result
     226              :    2. mul of the PHI result by an INTEGER_CST
     227              :    3. lshift/rshift of the PHI result by an INTEGER_CST.
     228              :    On success the preheader value, the step and the evolution type are recorded in STMT_INFO.  For neg induction the recorded step is a fake integer -1.  */
     229              : static bool
     230       171558 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
     231              :                                 gphi* loop_phi_node)
     232              : {
     233       171558 :   tree init_expr, ev_expr, result, op1, op2;
     234       171558 :   gimple* def;
     235              : 
     236       171558 :   if (gimple_phi_num_args (loop_phi_node) != 2)
     237              :     return false;
     238              : 
     239       171558 :   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
     240       171558 :   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
     241              : 
     242              :   /* Support nonlinear induction only for integer type.  */
     243       171558 :   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
     244              :     return false;
     245              : 
     246       108323 :   result = PHI_RESULT (loop_phi_node);
     247              : 
     248       108323 :   if (TREE_CODE (ev_expr) != SSA_NAME
     249       106060 :       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)  /* Comma expression: assigns DEF and always yields false, so the next operand is evaluated.  */
     250       108323 :       || !is_gimple_assign (def))
     251              :     return false;
     252              : 
     253        97524 :   enum tree_code t_code = gimple_assign_rhs_code (def);
     254        97524 :   tree step;
     255        97524 :   switch (t_code)
     256              :     {
     257         3510 :     case NEGATE_EXPR:
     258         3510 :       if (gimple_assign_rhs1 (def) != result)
     259              :         return false;
     260         3510 :       step = build_int_cst (TREE_TYPE (init_expr), -1);
     261         3510 :       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
     262         3510 :       break;
     263              : 
     264        11326 :     case RSHIFT_EXPR:
     265        11326 :     case LSHIFT_EXPR:
     266        11326 :     case MULT_EXPR:
     267        11326 :       op1 = gimple_assign_rhs1 (def);
     268        11326 :       op2 = gimple_assign_rhs2 (def);
     269        11326 :       if (TREE_CODE (op2) != INTEGER_CST
     270         7438 :           || op1 != result)
     271              :         return false;
     272         7051 :       step = op2;
     273         7051 :       if (t_code == LSHIFT_EXPR)
     274          472 :         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
     275         6579 :       else if (t_code == RSHIFT_EXPR)
     276         5609 :         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
     277              :       /* Only MULT_EXPR remains here; NEGATE_EXPR is handled by its own case above and is recorded as vect_step_op_neg.  */
     278              :       else
     279          970 :         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
     280              :       break;
     281              : 
     282              :     default:
     283              :       return false;
     284              :     }
     285              : 
     286        10561 :   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
     287        10561 :   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
     288              : 
     289        10561 :   return true;
     290              : }
     291              : 
     292              : /* Return true if PHI is a first-order recurrence in LOOP.  A first-order
     293              :    recurrence is a non-reduction recurrence relation in which the value of
     294              :    the recurrence in the current loop iteration equals a value defined in
     295              :    the previous iteration.  LOOP must be the loop that LOOP_VINFO represents; otherwise false is returned.  */
     296              : 
     297              : static bool
     298        66211 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
     299              :                                    gphi *phi)
     300              : {
     301              :   /* A nested cycle isn't vectorizable as first order recurrence.  */
     302        66211 :   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
     303              :     return false;
     304              : 
     305              :   /* Ensure the loop latch definition is an SSA name that is not a default definition, is not defined by a PHI, and is defined within the loop.  */
     306        66069 :   edge latch = loop_latch_edge (loop);
     307        66069 :   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
     308        66069 :   if (TREE_CODE (ldef) != SSA_NAME
     309        63463 :       || SSA_NAME_IS_DEFAULT_DEF (ldef)
     310        63397 :       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
     311       124905 :       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
     312         7886 :     return false;
     313              : 
     314        58183 :   tree def = gimple_phi_result (phi);
     315              : 
     316              :   /* Ensure every non-debug use_stmt of the phi node is dominated by the latch
     317              :      definition and is not the latch definition statement itself.  */
     318        58183 :   imm_use_iterator imm_iter;
     319        58183 :   use_operand_p use_p;
     320       128681 :   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
     321        69987 :     if (!is_gimple_debug (USE_STMT (use_p))
     322       136344 :         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
     323        45855 :             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
     324              :                                             USE_STMT (use_p))))
     325        57672 :       return false;
     326              : 
     327              :   /* First-order recurrence autovectorization needs shuffle vector.  Only the existence of a vector type for the PHI result's scalar type is checked here.  */
     328          511 :   tree scalar_type = TREE_TYPE (def);
     329          511 :   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
     330          511 :   if (!vectype)
     331              :     return false;
     332              : 
     333              :   return true;
     334              : }
     335              : 
     336              : /* Function vect_analyze_scalar_cycles_1.
     337              : 
     338              :    Examine the cross iteration def-use cycles of scalar variables
     339              :    in LOOP.  LOOP_VINFO represents the loop that is now being
     340              :    considered for vectorization (can be LOOP, or an outer-loop
     341              :    enclosing LOOP).  The header PHIs of LOOP are first checked for inductions; the remaining ones are then classified as reductions,
     342              :    double reductions, nested cycles or first-order recurrences, or are left as vect_unknown_def_type.  */
     343              : 
     344              : static void
     345       446232 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
     346              : {
     347       446232 :   basic_block bb = loop->header;
     348       446232 :   auto_vec<stmt_vec_info, 64> worklist;
     349       446232 :   gphi_iterator gsi;
     350              : 
     351       446232 :   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
     352              : 
     353              :   /* First - identify all inductions.  Reduction detection assumes that all the
     354              :      inductions have been identified, therefore, this order must not be
     355              :      changed.  */
     356      1597170 :   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     357              :     {
     358      1150938 :       gphi *phi = gsi.phi ();
     359      1150938 :       tree access_fn = NULL;
     360      1150938 :       tree def = PHI_RESULT (phi);
     361      1150938 :       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
     362              : 
     363              :       /* Skip virtual phi's.  The data dependences that are associated with
     364              :          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
     365      2301876 :       if (virtual_operand_p (def))
     366       404962 :         continue;
     367              : 
     368              :       /* Skip already analyzed inner loop PHIs of double reductions.  */
     369       910551 :       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
     370         1005 :         continue;
     371              : 
     372       909546 :       if (dump_enabled_p ())
     373        42038 :         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
     374              :                          (gimple *) phi);
     375              : 
     376       909546 :       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
     377              : 
     378              :       /* Analyze the evolution function.  */
     379       909546 :       access_fn = analyze_scalar_evolution (loop, def);
     380       909546 :       if (dump_enabled_p ())
     381        42038 :         dump_printf_loc (MSG_NOTE, vect_location,
     382              :                          "Access function of PHI: %T\n", access_fn);
     383       909546 :       if (access_fn)
     384       909546 :         STRIP_NOPS (access_fn);
     385              :       /* Defer the PHI to the worklist when it is not a simple induction (or, for a PHI of a loop other than the one in LOOP_VINFO, its step is not an INTEGER_CST) and it is not a nonlinear induction of the LOOP_VINFO loop either.  */
     386      1073116 :       if ((!access_fn
     387       909546 :            || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
     388       735421 :            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
     389        11368 :                && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
     390              :                    != INTEGER_CST)))
     391              :           /* Only handle nonlinear iv for same loop.  */
     392      1083677 :           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
     393       171558 :               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
     394              :         {
     395       163570 :           worklist.safe_push (stmt_vinfo);
     396       163570 :           continue;
     397              :         }
     398              : 
     399       745976 :       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
     400              :                   != NULL_TREE);
     401       745976 :       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
     402              : 
     403       745976 :       if (dump_enabled_p ())
     404        36933 :         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
     405       745976 :       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
     406              : 
     407              :       /* Mark if we have a non-linear IV.  NOTE(review): this is a plain assignment executed for every detected induction, so the flag reflects only the last induction PHI visited -- confirm whether an accumulating update was intended.  */
     408       745976 :       LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
     409       745976 :         = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
     410              :     }
     411              : 
     412              : 
     413              :   /* Second - identify all reductions and nested cycles among the PHIs deferred above.  */
     414       609802 :   while (worklist.length () > 0)
     415              :     {
     416       163570 :       stmt_vec_info stmt_vinfo = worklist.pop ();
     417       163570 :       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
     418       163570 :       tree def = PHI_RESULT (phi);
     419              : 
     420       163570 :       if (dump_enabled_p ())
     421         5105 :         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
     422              :                          (gimple *) phi);
     423              : 
     424       327140 :       gcc_assert (!virtual_operand_p (def)
     425              :                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
     426              : 
     427       163570 :       gphi *double_reduc;
     428       163570 :       stmt_vec_info reduc_stmt_info
     429       163570 :         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
     430       163570 :       if (reduc_stmt_info && double_reduc)
     431              :         {
     432         1107 :           stmt_vec_info inner_phi_info
     433         1107 :               = loop_vinfo->lookup_stmt (double_reduc);
     434              :           /* ???  Pass down flag we're the inner loop of a double reduc.  */
     435         1107 :           stmt_vec_info inner_reduc_info
     436         1107 :             = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
     437         1107 :           if (inner_reduc_info)
     438              :             {
     439         1005 :               STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
     440         1005 :               STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
     441         1005 :               STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
     442         1005 :               STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
     443         1005 :               if (dump_enabled_p ())
     444          130 :                 dump_printf_loc (MSG_NOTE, vect_location,
     445              :                                  "Detected double reduction.\n");
     446              : 
     447         1005 :               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
     448         1005 :               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
     449         1005 :               STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
     450              :               /* Make it accessible for SLP vectorization.  */
     451         1005 :               LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
     452              :             }
     453          102 :           else if (dump_enabled_p ())
     454           14 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     455              :                              "Unknown def-use cycle pattern.\n");
     456              :         }
     457       162463 :       else if (reduc_stmt_info)
     458              :         {
     459        96252 :           if (loop != LOOP_VINFO_LOOP (loop_vinfo))  /* A reduction found in a loop other than the one being vectorized is marked as a nested cycle.  */
     460              :             {
     461         2431 :               if (dump_enabled_p ())
     462          434 :                 dump_printf_loc (MSG_NOTE, vect_location,
     463              :                                  "Detected vectorizable nested cycle.\n");
     464              : 
     465         2431 :               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
     466              :             }
     467              :           else
     468              :             {
     469        93821 :               STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
     470        93821 :               STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
     471        93821 :               if (dump_enabled_p ())
     472         3974 :                 dump_printf_loc (MSG_NOTE, vect_location,
     473              :                                  "Detected reduction.\n");
     474              : 
     475        93821 :               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
     476        93821 :               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
     477        93821 :               LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
     478              :             }
     479              :         }
     480        66211 :       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
     481          505 :         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
     482              :       else
     483        65706 :         if (dump_enabled_p ())
     484          476 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     485              :                            "Unknown def-use cycle pattern.\n");
     486              :     }
     487       446232 : }
     488              : 
     489              : 
     490              : /* Function vect_analyze_scalar_cycles.
     491              : 
     492              :    Examine the cross iteration def-use cycles of scalar variables, by
     493              :    analyzing the loop-header PHIs of scalar variables.  Each cycle is
     494              :    classified by vect_analyze_scalar_cycles_1 as an induction, a reduction (including double reductions), a nested cycle, a first-order recurrence, or is left unknown.
     495              :    This is done for the loop represented by LOOP_VINFO, and also for its
     496              :    inner loop, if one exists.
     497              :    Examples for scalar cycles:
     498              : 
     499              :    Example1: reduction:
     500              : 
     501              :               loop1:
     502              :               for (i=0; i<N; i++)
     503              :                  sum += a[i];
     504              : 
     505              :    Example2: induction:
     506              : 
     507              :               loop2:
     508              :               for (i=0; i<N; i++)
     509              :                  a[i] = i;  */
     510              : 
     511              : static void
     512       440452 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
     513              : {
     514       440452 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
     515              : 
     516       440452 :   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
     517              : 
     518              :   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     519              :      Reductions in such inner-loop therefore have different properties than
     520              :      the reductions in the nest that gets vectorized:
     521              :      1. When vectorized, they are executed in the same order as in the original
     522              :         scalar loop, so we can't change the order of computation when
     523              :         vectorizing them.
     524              :      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
     525              :         current checks are too strict.  */
     526              : 
     527       440452 :   if (loop->inner)  /* The same LOOP_VINFO is passed, so the helper sees a loop different from LOOP_VINFO_LOOP.  */
     528         5780 :     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
     529       440452 : }
     530              : 
     531              : /* Function vect_get_loop_niters.
     532              : 
     533              :    Determine how many iterations the loop is executed and place it
     534              :    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
     535              :    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
     536              :    niter information holds in ASSUMPTIONS.  Only MAIN_EXIT is used to
     537              :    compute these; when it cannot be analyzed both iteration counts stay
     538              :    chrec_dont_know.  Return the conditions of all exits of LOOP that have one.  */
     539              : 
     540              : 
     541              : static vec<gcond *>
     542       278515 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
     543              :                       tree *number_of_iterations, tree *number_of_iterationsm1)
     544              : {
     545       278515 :   auto_vec<edge> exits = get_loop_exit_edges (loop);
     546       278515 :   vec<gcond *> conds;
     547       557030 :   conds.create (exits.length ());  /* Reserve one slot per exit edge.  */
     548       278515 :   class tree_niter_desc niter_desc;
     549       278515 :   tree niter_assumptions, niter, may_be_zero;
     550              : 
     551       278515 :   *assumptions = boolean_true_node;  /* Defaults, kept if MAIN_EXIT is not analyzable.  */
     552       278515 :   *number_of_iterationsm1 = chrec_dont_know;
     553       278515 :   *number_of_iterations = chrec_dont_know;
     554              : 
     555       278515 :   DUMP_VECT_SCOPE ("get_loop_niters");
     556              : 
     557       278515 :   if (exits.is_empty ())
     558            0 :     return conds;
     559              : 
     560       278515 :   if (dump_enabled_p ())
     561        14647 :     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
     562              :                      exits.length ());
     563              : 
     564              :   edge exit;
     565              :   unsigned int i;
     566       677583 :   FOR_EACH_VEC_ELT (exits, i, exit)
     567              :     {
     568       399068 :       gcond *cond = get_loop_exit_condition (exit);
     569       399068 :       if (cond)
     570       399035 :         conds.safe_push (cond);  /* Collected for every exit, not just MAIN_EXIT.  */
     571              : 
     572       399068 :       if (dump_enabled_p ())
     573        15803 :         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
     574              : 
     575       399068 :       if (exit != main_exit)
     576       178800 :         continue;  /* Iteration counts are derived from MAIN_EXIT only.  */
     577              : 
     578       278515 :       may_be_zero = NULL_TREE;
     579       278515 :       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
     580       278515 :           || chrec_contains_undetermined (niter_desc.niter))
     581        58247 :         continue;
     582              : 
     583       220268 :       niter_assumptions = niter_desc.assumptions;
     584       220268 :       may_be_zero = niter_desc.may_be_zero;
     585       220268 :       niter = niter_desc.niter;
     586              : 
     587       220268 :       if (may_be_zero && integer_zerop (may_be_zero))
     588              :         may_be_zero = NULL_TREE;  /* A constant-zero MAY_BE_ZERO needs no handling.  */
     589              : 
     590         9480 :       if (may_be_zero)
     591              :         {
     592         9480 :           if (COMPARISON_CLASS_P (may_be_zero))
     593              :             {
     594              :               /* Try to combine may_be_zero with assumptions, this can simplify
     595              :                  computation of niter expression.  */
     596         9480 :               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
     597          951 :                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
     598              :                                                  niter_assumptions,
     599              :                                                  fold_build1 (TRUTH_NOT_EXPR,
     600              :                                                               boolean_type_node,
     601              :                                                               may_be_zero));
     602              :               else
     603         8529 :                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
     604              :                                      build_int_cst (TREE_TYPE (niter), 0),
     605              :                                      rewrite_to_non_trapping_overflow (niter));
     606              : 
     607       220268 :               may_be_zero = NULL_TREE;
     608              :             }
     609            0 :           else if (integer_nonzerop (may_be_zero))
     610              :             {
     611            0 :               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);  /* Zero latch iterations...  */
     612            0 :               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);  /* ...so one header execution.  */
     613            0 :               continue;
     614              :             }
     615              :           else
     616            0 :             continue;
     617              :        }
     618              : 
     619              :       /* Loop assumptions are based off the normal exit.  */
     620       220268 :       *assumptions = niter_assumptions;
     621       220268 :       *number_of_iterationsm1 = niter;
     622              : 
     623              :       /* We want the number of loop header executions which is the number
     624              :          of latch executions plus one.
     625              :          ???  For UINT_MAX latch executions this number overflows to zero
     626              :          for loops like do { n++; } while (n != 0);  */
     627       220268 :       if (niter && !chrec_contains_undetermined (niter))
     628              :         {
     629       220268 :           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
     630              :                                unshare_expr (niter),
     631              :                                build_int_cst (TREE_TYPE (niter), 1));
     632       220268 :           if (TREE_CODE (niter) == INTEGER_CST
     633       121261 :               && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
     634              :             {
     635              :               /* If we manage to fold niter + 1 into INTEGER_CST even when
     636              :                  niter is some complex expression, ensure back
     637              :                  *number_of_iterationsm1 is an INTEGER_CST as well.  See
     638              :                  PR113210.  */
     639            0 :               *number_of_iterationsm1
     640            0 :                 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
     641              :                                build_minus_one_cst (TREE_TYPE (niter)));
     642              :             }
     643              :         }
     644       220268 :       *number_of_iterations = niter;
     645              :     }
     646              : 
     647       278515 :   if (dump_enabled_p ())
     648        14647 :     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");  /* NOTE(review): emitted even when MAIN_EXIT could not be analyzed above.  */
     649              : 
     650       278515 :   return conds;
     651       278515 : }
     652              : 
     653              : /*  Determine the main loop exit for the vectorizer.  Return NULL if LOOP has no exits, if one of several exits has no exit condition, or if no candidate is found.  */
     654              : 
     655              : edge
     656       490724 : vec_init_loop_exit_info (class loop *loop)
     657              : {
     658              :   /* Before we begin we must first determine which exit is the main one and
     659              :      which are auxiliary exits.  */
     660       490724 :   auto_vec<edge> exits = get_loop_exit_edges (loop);
     661       976393 :   if (exits.length () == 0)
     662              :     return NULL;
     663       485669 :   if (exits.length () == 1)
     664       320846 :     return exits[0];  /* A single exit is returned without further analysis.  */
     665              : 
     666              :   /* If we have multiple exits, look for counting IV exit.
     667              :      Analyze all exits and return the last one we can analyze.  */
     668       164823 :   class tree_niter_desc niter_desc;
     669       164823 :   edge candidate = NULL;
     670       610274 :   for (edge exit : exits)
     671              :     {
     672       465476 :       if (!get_loop_exit_condition (exit))
     673              :         {
     674        20025 :           if (dump_enabled_p ())
     675           14 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     676              :                              "Unhandled loop exit detected.\n");
     677        20025 :           return NULL;
     678              :         }
     679              : 
     680       445451 :       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
     681       445451 :           && !chrec_contains_undetermined (niter_desc.niter))
     682              :         {
     683       132468 :           tree may_be_zero = niter_desc.may_be_zero;
     684       132468 :           if ((integer_zerop (may_be_zero)
     685              :                /* As we are handling may_be_zero that's not false by
     686              :                   rewriting niter to may_be_zero ? 0 : niter we require
     687              :                   an empty latch.  */
     688       455812 :                || (single_pred_p (loop->latch)
     689        10026 :                    && exit->src == single_pred (loop->latch)
     690         2535 :                    && (integer_nonzerop (may_be_zero)
     691         2535 :                        || COMPARISON_CLASS_P (may_be_zero))))
     692       135003 :               && (!candidate
     693         5898 :                   || dominated_by_p (CDI_DOMINATORS, exit->src,
     694         5898 :                                      candidate->src)))
     695              :             candidate = exit;  /* Replace only when this exit's source is dominated by the current candidate's.  */
     696              :         }
     697              :     }
     698              : 
     699              :   /* If no exit is analyzable by scalar evolution, we return the last exit
     700              :      under the assumption we are dealing with an uncounted loop.  */
     701       199153 :   if (!candidate && single_pred_p (loop->latch))
     702        34330 :     candidate = loop_exits_from_bb_p (loop, single_pred (loop->latch));
     703              : 
     704              :   return candidate;
     705       164823 : }
     706              : 
     707              : /* Function bb_in_loop_p
     708              : 
     709              :    Used as predicate for dfs order traversal of the loop bbs.  DATA is the loop, passed as an untyped pointer; return true iff BB is inside it.  */
     710              : 
     711              : static bool
     712      1679355 : bb_in_loop_p (const_basic_block bb, const void *data)
     713              : {
     714      1679355 :   const class loop *const loop = (const class loop *)data;
     715      1679355 :   if (flow_bb_inside_loop_p (loop, bb))
     716              :     return true;
     717              :   return false;
     718              : }
     719              : 
     720              : 
     721              : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
     722              :    stmt_vec_info structs for all the PHIs and non-debug stmts in LOOP_IN.  */
     723              : 
     724       575326 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     725              :   : vec_info (vec_info::loop, shared),
     726       575326 :     loop (loop_in),
     727       575326 :     num_itersm1 (NULL_TREE),
     728       575326 :     num_iters (NULL_TREE),
     729       575326 :     num_iters_unchanged (NULL_TREE),
     730       575326 :     num_iters_assumptions (NULL_TREE),
     731       575326 :     vector_costs (nullptr),
     732       575326 :     scalar_costs (nullptr),
     733       575326 :     th (0),
     734       575326 :     versioning_threshold (0),
     735       575326 :     vectorization_factor (0),
     736       575326 :     main_loop_edge (nullptr),
     737       575326 :     skip_main_loop_edge (nullptr),
     738       575326 :     skip_this_loop_edge (nullptr),
     739       575326 :     reusable_accumulators (),
     740       575326 :     suggested_unroll_factor (1),
     741       575326 :     max_vectorization_factor (0),
     742       575326 :     mask_skip_niters (NULL_TREE),
     743       575326 :     mask_skip_niters_pfa_offset (NULL_TREE),
     744       575326 :     rgroup_compare_type (NULL_TREE),
     745       575326 :     simd_if_cond (NULL_TREE),
     746       575326 :     partial_vector_style (vect_partial_vectors_none),
     747       575326 :     unaligned_dr (NULL),
     748       575326 :     peeling_for_alignment (0),
     749       575326 :     ptr_mask (0),
     750       575326 :     max_spec_read_amount (0),
     751       575326 :     nonlinear_iv (false),
     752       575326 :     ivexpr_map (NULL),
     753       575326 :     scan_map (NULL),
     754       575326 :     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
     755       575326 :     vectorizable (false),
     756       575326 :     can_use_partial_vectors_p (true),
     757       575326 :     must_use_partial_vectors_p (false),
     758       575326 :     using_partial_vectors_p (false),
     759       575326 :     using_decrementing_iv_p (false),
     760       575326 :     using_select_vl_p (false),
     761       575326 :     allow_mutual_alignment (false),
     762       575326 :     partial_load_store_bias (0),
     763       575326 :     peeling_for_gaps (false),
     764       575326 :     peeling_for_niter (false),
     765       575326 :     early_breaks (false),
     766       575326 :     loop_iv_cond (NULL),
     767       575326 :     user_unroll (false),
     768       575326 :     no_data_dependencies (false),
     769       575326 :     has_mask_store (false),
     770       575326 :     scalar_loop_scaling (profile_probability::uninitialized ()),
     771       575326 :     scalar_loop (NULL),
     772       575326 :     main_loop_info (NULL),
     773       575326 :     orig_loop_info (NULL),
     774       575326 :     epilogue_vinfo (NULL),
     775       575326 :     drs_advanced_by (NULL_TREE),
     776       575326 :     vec_loop_main_exit (NULL),
     777       575326 :     vec_epilogue_loop_main_exit (NULL),
     778       575326 :     scalar_loop_main_exit (NULL),
     779       575326 :     early_break_needs_epilogue (false),
     780       575326 :     early_break_niters_var (NULL)
     781              : {
     782              :   /* CHECKME: We want to visit all BBs before their successors (except for
     783              :      latch blocks, for which this assertion wouldn't hold).  In the simple
     784              :      case of the loop forms we allow, a dfs order of the BBs would be the same
     785              :      as reversed postorder traversal, so we are safe.  */
     786              : 
     787       575326 :   bbs = XCNEWVEC (basic_block, loop->num_nodes);
     788      1150652 :   nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
     789       575326 :                              loop->num_nodes, loop);
     790       575326 :   gcc_assert (nbbs == loop->num_nodes);  /* Every block of the loop must have been enumerated.  */
     791              : 
     792      2003910 :   for (unsigned int i = 0; i < nbbs; i++)
     793              :     {
     794      1428584 :       basic_block bb = bbs[i];
     795      1428584 :       gimple_stmt_iterator si;
     796              : 
     797      2952232 :       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
     798              :         {
     799      1523648 :           gimple *phi = gsi_stmt (si);
     800      1523648 :           gimple_set_uid (phi, 0);
     801      1523648 :           add_stmt (phi);
     802              :         }
     803              : 
     804     13230563 :       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
     805              :         {
     806     10373395 :           gimple *stmt = gsi_stmt (si);
     807     10373395 :           gimple_set_uid (stmt, 0);
     808     10373395 :           if (is_gimple_debug (stmt))
     809      4405261 :             continue;  /* Debug stmts get their UID reset but are not added.  */
     810      5968134 :           add_stmt (stmt);
     811              :           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
     812              :              third argument is the #pragma omp simd if (x) condition, when 0,
     813              :              loop shouldn't be vectorized, when non-zero constant, it should
     814              :              be vectorized normally, otherwise versioned with vectorized loop
     815              :              done if the condition is non-zero at runtime.  */
     816      5968134 :           if (loop_in->simduid
     817        43372 :               && is_gimple_call (stmt)
     818         4268 :               && gimple_call_internal_p (stmt)
     819         4141 :               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
     820         4137 :               && gimple_call_num_args (stmt) >= 3
     821          103 :               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
     822      5968237 :               && (loop_in->simduid
     823          103 :                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
     824              :             {
     825          103 :               tree arg = gimple_call_arg (stmt, 2);
     826          103 :               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
     827          103 :                 simd_if_cond = arg;  /* Recorded only for constant zero or a runtime SSA value.  */
     828              :               else
     829            0 :                 gcc_assert (integer_nonzerop (arg));
     830              :             }
     831              :         }
     832              :     }
     833       575326 : }
     834              : 
     835              : /* Free all levels of rgroup CONTROLS: first the controls vector of each rgroup, then the vector of rgroups itself.  */
     836              : 
     837              : void
     838      1426505 : release_vec_loop_controls (vec<rgroup_controls> *controls)
     839              : {
     840      1426505 :   rgroup_controls *rgc;
     841      1426505 :   unsigned int i;
     842      1451011 :   FOR_EACH_VEC_ELT (*controls, i, rgc)
     843        24506 :     rgc->controls.release ();
     844      1426505 :   controls->release ();
     845      1426505 : }
     846              : 
     847              : /* Free all memory used by the _loop_vec_info, as well as all the
     848              :    stmt_vec_info structs of all the stmts in the loop.  */
     849              : 
     850       575326 : _loop_vec_info::~_loop_vec_info ()
     851              : {
     852       575326 :   free (bbs);
     853              : 
     854       575326 :   release_vec_loop_controls (&masks.rgc_vec);
     855       575326 :   release_vec_loop_controls (&lens);
     856       579219 :   delete ivexpr_map;
     857       575648 :   delete scan_map;
     858       575326 :   delete scalar_costs;
     859       575326 :   delete vector_costs;
     860       787471 :   for (auto reduc_info : reduc_infos)
     861       203618 :     delete reduc_info;
     862              : 
     863              :   /* When we release an epilogue vinfo that we do not intend to use
     864              :      avoid clearing AUX of the main loop which should continue to
     865              :      point to the main loop vinfo since otherwise we'll leak that.  */
     866       575326 :   if (loop->aux == this)
     867        61807 :     loop->aux = NULL;
     868      1150652 : }
     869              : 
     870              : /* Return an invariant or register for EXPR and emit necessary
     871              :    computations in the LOOP_VINFO loop preheader.  Results are cached in LOOP_VINFO's ivexpr_map, so a repeated EXPR reuses the earlier value.  */
     872              : 
     873              : tree
     874        20502 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
     875              : {
     876        20502 :   if (is_gimple_reg (expr)
     877        20502 :       || is_gimple_min_invariant (expr))
     878         6956 :     return expr;  /* Already usable as is; nothing to emit or cache.  */
     879              : 
     880        13546 :   if (! loop_vinfo->ivexpr_map)
     881         3893 :     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;  /* Created on first use.  */
     882        13546 :   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
     883        13546 :   if (! cached)
     884              :     {
     885         8676 :       gimple_seq stmts = NULL;
     886         8676 :       cached = force_gimple_operand (unshare_expr (expr),
     887              :                                      &stmts, true, NULL_TREE);
     888         8676 :       if (stmts)
     889              :         {
     890         8528 :           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
     891         8528 :           gsi_insert_seq_on_edge_immediate (e, stmts);
     892              :         }
     893              :     }
     894        13546 :   return cached;
     895              : }
     896              : 
     897              : /* Return true if we can use CMP_TYPE as the comparison type to produce
     898              :    all masks required to mask LOOP_VINFO, i.e. IFN_WHILE_ULT is supported from CMP_TYPE to every rgroup mask type.  Rgroups whose type is NULL_TREE are skipped.  */
     899              : 
     900              : static bool
     901       109782 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
     902              : {
     903       109782 :   rgroup_controls *rgm;
     904       109782 :   unsigned int i;
     905       125504 :   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
     906       125504 :     if (rgm->type != NULL_TREE
     907       125504 :         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
     908              :                                             cmp_type, rgm->type,
     909              :                                             OPTIMIZE_FOR_SPEED))
     910              :       return false;
     911              :   return true;
     912              : }
     913              : 
     914              : /* Calculate the maximum number of scalars per iteration over all
     915              :    rgroups in LOOP_VINFO.  The result is at least 1.  */
     916              : 
     917              : static unsigned int
     918        23393 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
     919              : {
     920        23393 :   unsigned int res = 1;
     921        23393 :   unsigned int i;
     922        23393 :   rgroup_controls *rgm;
     923        56100 :   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
     924        32707 :     res = MAX (res, rgm->max_nscalars_per_iter);
     925        23393 :   return res;
     926              : }
     927              : 
     928              : /* Calculate the minimum precision necessary to represent:
     929              : 
     930              :       MAX_NITERS * FACTOR
     931              : 
     932              :    as an unsigned integer, where MAX_NITERS is the maximum number of
     933              :    loop header iterations for the original scalar form of LOOP_VINFO.  */
     934              : 
     935              : unsigned
     936        25774 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
     937              : {
     938        25774 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
     939              : 
     940              :   /* Get the maximum number of iterations that is representable
     941              :      in the counter type.  */
     942        25774 :   tree ni_type;
     943        25774 :   if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
     944        25774 :     ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
     945              :   else
     946            0 :     ni_type = sizetype;  /* Uncounted loops fall back to sizetype.  */
     947        25774 :   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
     948              : 
     949              :   /* Get a more refined estimate for the number of iterations.  */
     950        25774 :   widest_int max_back_edges;
     951        25774 :   if (max_loop_iterations (loop, &max_back_edges))
     952        25774 :     max_ni = wi::smin (max_ni, max_back_edges + 1);  /* Back edges plus one gives header executions.  */
     953              : 
     954              :   /* Work out how many bits we need to represent the limit.  */
     955        25774 :   return wi::min_precision (max_ni * factor, UNSIGNED);
     956        25774 : }
     957              : 
     958              : /* True if the loop needs peeling or partial vectors when vectorized.  For an epilogue loop the alignment peeling amount is read from the main loop's info.  */
     959              : 
     960              : static bool
     961       155269 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
     962              : {
     963       155269 :   unsigned HOST_WIDE_INT const_vf;
     964              : 
     965       155269 :   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
     966              :     return true;
     967              : 
     968        13362 :   loop_vec_info main_loop_vinfo
     969       153983 :     = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
     970       153983 :        ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
     971       153983 :   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
     972        79008 :       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
     973              :     {
     974              :       /* Work out the (constant) number of iterations that need to be
     975              :          peeled for reasons other than niters.  */
     976        78958 :       unsigned int peel_niter
     977              :         = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
     978        78958 :       return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
     979        78958 :                           LOOP_VINFO_VECT_FACTOR (loop_vinfo));
     980              :     }
     981              : 
     982        75025 :   if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
     983        75025 :       && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
     984              :     {
     985              :       /* When the number of iterations is a multiple of the vectorization
     986              :          factor and we are not doing prologue or forced epilogue peeling
     987              :          the epilogue isn't necessary.  */
     988        74611 :       if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
     989       149222 :           >= (unsigned) exact_log2 (const_vf))  /* NOTE(review): presumably exact_log2 gives -1 for a non-power-of-two, which the cast makes UINT_MAX so the test fails -- confirm.  */
     990              :         return false;
     991              :     }
     992              : 
     993              :   return true;
     994              : }
     995              : 
     996              : /* Each statement in LOOP_VINFO can be masked where necessary.  Check
     997              :    whether we can actually generate the masks required.  Return true if so,
     998              :    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
     999              : 
    1000              : static bool
    1001        23393 : vect_verify_full_masking (loop_vec_info loop_vinfo)
    1002              : {
    1003        23393 :   unsigned int min_ni_width;
    1004              : 
    1005              :   /* Use a normal loop if there are no statements that need masking.
    1006              :      This only happens in rare degenerate cases: it means that the loop
    1007              :      has no loads, no stores, and no live-out values.  */
    1008        23393 :   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    1009              :     return false;
    1010              : 
    1011              :   /* Produce the rgroup controls.  */
    1012        92189 :   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    1013              :     {
    1014        34398 :       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
    1015        34398 :       tree vectype = mask.first;
    1016        34398 :       unsigned nvectors = mask.second;
    1017              : 
    1018        45403 :       if (masks->rgc_vec.length () < nvectors)
    1019        25614 :         masks->rgc_vec.safe_grow_cleared (nvectors, true);
    1020        34398 :       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
    1021              :       /* The number of scalars per iteration and the number of vectors are
    1022              :          both compile-time constants.  */
    1023        34398 :       unsigned int nscalars_per_iter
    1024        34398 :           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
    1025        34398 :                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
    1026              : 
    1027        34398 :       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
    1028              :         {
    1029        27427 :           rgm->max_nscalars_per_iter = nscalars_per_iter;
    1030        27427 :           rgm->type = truth_type_for (vectype);
    1031        27427 :           rgm->factor = 1;
    1032              :         }
    1033              :     }
    1034              : 
    1035        23393 :   unsigned int max_nscalars_per_iter
    1036        23393 :     = vect_get_max_nscalars_per_iter (loop_vinfo);
    1037              : 
    1038              :   /* Work out how many bits we need to represent the limit.  */
    1039        23393 :   min_ni_width
    1040        23393 :     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
    1041              : 
    1042              :   /* Find a scalar mode for which WHILE_ULT is supported.  */
    1043        23393 :   opt_scalar_int_mode cmp_mode_iter;
    1044        23393 :   tree cmp_type = NULL_TREE;
    1045        23393 :   tree iv_type = NULL_TREE;
    1046        23393 :   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
    1047        23393 :   unsigned int iv_precision = UINT_MAX;
    1048              : 
    1049        23393 :   if (iv_limit != -1)
    1050        23393 :     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
    1051              :                                       UNSIGNED);
    1052              : 
    1053       187144 :   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    1054              :     {
    1055       163751 :       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
    1056       163751 :       if (cmp_bits >= min_ni_width
    1057       163751 :           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
    1058              :         {
    1059       109782 :           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
    1060       109782 :           if (this_type
    1061       109782 :               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
    1062              :             {
    1063              :               /* Although we could stop as soon as we find a valid mode,
    1064              :                  there are at least two reasons why that's not always the
    1065              :                  best choice:
    1066              : 
    1067              :                  - An IV that's Pmode or wider is more likely to be reusable
    1068              :                    in address calculations than an IV that's narrower than
    1069              :                    Pmode.
    1070              : 
    1071              :                  - Doing the comparison in IV_PRECISION or wider allows
    1072              :                    a natural 0-based IV, whereas using a narrower comparison
    1073              :                    type requires mitigations against wrap-around.
    1074              : 
    1075              :                  Conversely, if the IV limit is variable, doing the comparison
    1076              :                  in a wider type than the original type can introduce
    1077              :                  unnecessary extensions, so picking the widest valid mode
    1078              :                  is not always a good choice either.
    1079              : 
    1080              :                  Here we prefer the first IV type that's Pmode or wider,
    1081              :                  and the first comparison type that's IV_PRECISION or wider.
    1082              :                  (The comparison type must be no wider than the IV type,
    1083              :                  to avoid extensions in the vector loop.)
    1084              : 
    1085              :                  ??? We might want to try continuing beyond Pmode for ILP32
    1086              :                  targets if CMP_BITS < IV_PRECISION.  */
    1087            0 :               iv_type = this_type;
    1088            0 :               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
    1089              :                 cmp_type = this_type;
    1090            0 :               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
    1091              :                 break;
    1092              :             }
    1093              :         }
    1094              :     }
    1095              : 
    1096        23393 :   if (!cmp_type)
    1097              :     {
    1098        23393 :       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
    1099        23393 :       return false;
    1100              :     }
    1101              : 
    1102            0 :   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
    1103            0 :   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
    1104            0 :   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
    1105            0 :   return true;
    1106        23393 : }
    1107              : 
    1108              : /* Each statement in LOOP_VINFO can be masked where necessary.  Check
    1109              :    whether we can actually generate AVX512 style masks.  Return true if so,
    1110              :    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE, a vector
                      :    compare type in every used rgroup control, error_mark_node in
                      :    LOOP_VINFO_RGROUP_COMPARE_TYPE (there is no single compare type) and
                      :    vect_partial_vectors_avx512 in LOOP_VINFO_PARTIAL_VECTORS_STYLE.  */
    1111              : 
    1112              : static bool
    1113        23393 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
    1114              : {
    1115              :   /* Produce differently organized rgc_vec (indexed by the number of scalars
    1116              :      per iteration, see below) and differently check we can produce masks.  */
    1117              : 
    1118              :   /* Use a normal loop if there are no statements that need masking.
    1119              :      This only happens in rare degenerate cases: it means that the loop
    1120              :      has no loads, no stores, and no live-out values.  */
    1121        23393 :   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    1122              :     return false;
    1123              : 
    1124              :   /* For the decrementing IV we need to represent all values in
    1125              :      [0, niter + niter_skip] where niter_skip is the elements we
    1126              :      skip in the first iteration for prologue peeling.  An IV limit
                      :      of -1 leaves IV_PRECISION at UINT_MAX, so no mode qualifies below.  */
    1127        23393 :   tree iv_type = NULL_TREE;
    1128        23393 :   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
    1129        23393 :   unsigned int iv_precision = UINT_MAX;
    1130        23393 :   if (iv_limit != -1)
    1131        23393 :     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
    1132              : 
    1133              :   /* First compute the type for the IV we use to track the remaining
    1134              :      scalar iterations: the first supported integer mode that is at
                      :      least IV_PRECISION bits wide.  */
    1135        23393 :   opt_scalar_int_mode cmp_mode_iter;
    1136        30550 :   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    1137              :     {
    1138        30550 :       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
    1139        30550 :       if (cmp_bits >= iv_precision
    1140        30550 :           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
    1141              :         {
    1142        23393 :           iv_type = build_nonstandard_integer_type (cmp_bits, true);
    1143        23393 :           if (iv_type)
    1144              :             break;
    1145              :         }
    1146              :     }
    1147        23393 :   if (!iv_type)
    1148              :     return false;
    1149              : 
    1150              :   /* Produce the rgroup controls from the recorded (vectype, nvectors) pairs.  */
    1151        92189 :   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    1152              :     {
    1153        34398 :       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
    1154        34398 :       tree vectype = mask.first;
    1155        34398 :       unsigned nvectors = mask.second;
    1156              : 
    1157              :       /* The number of scalars per iteration and the number of vectors are
    1158              :          both compile-time constants.  */
    1159        34398 :       unsigned int nscalars_per_iter
    1160        34398 :         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
    1161        34398 :                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
    1162              : 
    1163              :       /* We index the rgroup_controls vector with nscalars_per_iter
    1164              :          which we keep constant and instead have a varying nvectors,
    1165              :          remembering the vector mask with the fewest nV.  The slot for
                      :          NSCALARS_PER_ITER is at index NSCALARS_PER_ITER - 1.  */
    1166        45403 :       if (masks->rgc_vec.length () < nscalars_per_iter)
    1167        23447 :         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
    1168        34398 :       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
    1169              : 
    1170        34398 :       if (!rgm->type || rgm->factor > nvectors)
    1171              :         {
    1172        25263 :           rgm->type = truth_type_for (vectype);
    1173        25263 :           rgm->compare_type = NULL_TREE;
    1174        25263 :           rgm->max_nscalars_per_iter = nscalars_per_iter;
    1175        25263 :           rgm->factor = nvectors;
    1176        25263 :           rgm->bias_adjusted_ctrl = NULL_TREE;
    1177              :         }
    1178              :     }
    1179              : 
    1180              :   /* There is no fixed compare type we are going to use but we have to
    1181              :      be able to get at one for each mask group.  Every candidate must be
                      :      at least MIN_NI_WIDTH bits wide, the precision of vect_max_vf.  */
    1182        23393 :   unsigned int min_ni_width
    1183        23393 :     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
    1184              : 
    1185        23393 :   bool ok = true;
    1186        88574 :   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    1187              :     {
    1188        24458 :       tree mask_type = rgc.type;
    1189        24458 :       if (!mask_type)
    1190          986 :         continue;
    1191              : 
    1192              :       /* For now vect_get_loop_mask only supports integer mode masks
    1193              :          when we need to split it.  */
    1194        23472 :       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
    1195        23472 :           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
    1196              :         {
    1197              :           ok = false;
    1198              :           break;
    1199              :         }
    1200              : 
    1201              :       /* If iv_type is usable as compare type use that - we can elide the
    1202              :          saturation in that case.  */
    1203        17413 :       if (TYPE_PRECISION (iv_type) >= min_ni_width)
    1204              :         {
    1205        17413 :           tree cmp_vectype
    1206        17413 :             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
    1207        17413 :           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
    1208         5930 :             rgc.compare_type = cmp_vectype;
    1209              :         }
    1210        17413 :       if (!rgc.compare_type)
    1211        33019 :         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    1212              :           {
    1213        33015 :             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
    1214        33015 :             if (cmp_bits >= min_ni_width
    1215        33015 :                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
    1216              :               {
    1217        33003 :                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
    1218        33003 :                 if (!cmp_type)
    1219            0 :                   continue;
    1220              : 
    1221              :                 /* Check whether we can produce the mask with cmp_type; the
                      :                    first mode for which the LT_EXPR compare expands wins.  */
    1222        33003 :                 tree cmp_vectype
    1223        33003 :                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
    1224        33003 :                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
    1225              :                   {
    1226        11479 :                     rgc.compare_type = cmp_vectype;
    1227        11479 :                     break;
    1228              :                   }
    1229              :               }
    1230              :         }
    1231        17413 :       if (!rgc.compare_type)
    1232              :         {
    1233              :           ok = false;
    1234              :           break;
    1235              :         }
    1236              :     }
    1237        23393 :   if (!ok)
    1238              :     {
    1239         6063 :       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
    1240         6063 :       return false;
    1241              :     }
    1242              : 
    1243        17330 :   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
    1244        17330 :   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
    1245        17330 :   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
    1246        17330 :   return true;
    1247        23393 : }
    1248              : 
    1249              : /* Check whether we can use vector access with length based on precision
    1250              :    comparison.  So far, to keep it simple, we only allow the case that the
    1251              :    precision of the target supported length is larger than the precision
    1252              :    required by loop niters.  Return true if so, storing the chosen IV type
                      :    in both LOOP_VINFO_RGROUP_COMPARE_TYPE and LOOP_VINFO_RGROUP_IV_TYPE and
                      :    vect_partial_vectors_len in LOOP_VINFO_PARTIAL_VECTORS_STYLE.  The
                      :    load/store bias is recorded in LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS once
                      :    it is known to be supported, even if the IV type search fails later.  */
    1253              : 
    1254              : static bool
    1255            6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
    1256              : {
    1257            6 :   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    1258              :     return false;
    1259              : 
    1260            0 :   if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
    1261              :     return false;
    1262              : 
    1263            0 :   machine_mode len_load_mode, len_store_mode;
    1264            0 :   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
    1265            0 :          .exists (&len_load_mode))
    1266            0 :     return false;
    1267            0 :   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
    1268            0 :          .exists (&len_store_mode))
    1269            0 :     return false;
    1270              : 
    1271            0 :   signed char partial_load_bias = internal_len_load_store_bias
    1272            0 :     (IFN_LEN_LOAD, len_load_mode);
    1273              : 
    1274            0 :   signed char partial_store_bias = internal_len_load_store_bias
    1275            0 :     (IFN_LEN_STORE, len_store_mode);
    1276              : 
    1277            0 :   gcc_assert (partial_load_bias == partial_store_bias);
    1278              : 
    1279            0 :   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    1280              :     return false;
    1281              : 
    1282              :   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
    1283              :      len_loads with a length of zero.  In order to avoid that we prohibit
    1284              :      more than one loop length here.  */
    1285            0 :   if (partial_load_bias == -1
    1286            0 :       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    1287              :     return false;
    1288              : 
    1289            0 :   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
    1290              : 
    1291            0 :   unsigned int max_nitems_per_iter = 1;
    1292            0 :   unsigned int i;
    1293            0 :   rgroup_controls *rgl;
    1294              :   /* Find the maximum number of items per iteration over all rgroups.  */
    1295            0 :   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    1296              :     {
    1297            0 :       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
    1298            0 :       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    1299              :     }
    1300              : 
    1301              :   /* Work out how many bits we need to represent the length limit.  */
    1302            0 :   unsigned int min_ni_prec
    1303            0 :     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
    1304              : 
    1305              :   /* Now use the maximum of the precisions below to pick one suitable IV type:
    1306              :      - the IV's natural precision
    1307              :      - the precision needed to hold: the maximum number of scalar
    1308              :        iterations multiplied by the scale factor (min_ni_prec above)
    1309              :      - the Pmode precision
    1310              : 
    1311              :      If min_ni_prec is less than the precision of the current niters,
    1312              :      we prefer to still use the niters type.  Prefer to use Pmode and
    1313              :      wider IV to avoid narrow conversions.  */
    1314              : 
    1315            0 :   unsigned int ni_prec
    1316            0 :     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
    1317            0 :   min_ni_prec = MAX (min_ni_prec, ni_prec);
    1318            0 :   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
    1319              : 
    1320            0 :   tree iv_type = NULL_TREE;
    1321            0 :   opt_scalar_int_mode tmode_iter;
    1322            0 :   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    1323              :     {
    1324            0 :       scalar_mode tmode = tmode_iter.require ();
    1325            0 :       unsigned int tbits = GET_MODE_BITSIZE (tmode);
    1326              : 
    1327              :       /* ??? Do we really want to construct one IV whose precision exceeds
    1328              :          BITS_PER_WORD?  For now we give up once the modes get wider.  */
    1329            0 :       if (tbits > BITS_PER_WORD)
    1330              :         break;
    1331              : 
    1332              :       /* Find the first available standard integral type that is wide enough.  */
    1333            0 :       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
    1334              :         {
    1335            0 :           iv_type = build_nonstandard_integer_type (tbits, true);
    1336            0 :           break;
    1337              :         }
    1338              :     }
    1339              : 
    1340            0 :   if (!iv_type)
    1341              :     {
    1342            0 :       if (dump_enabled_p ())
    1343            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1344              :                          "can't vectorize with length-based partial vectors"
    1345              :                          " because there is no suitable iv type.\n");
    1346            0 :       return false;
    1347              :     }
    1348              : 
    1349            0 :   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
    1350            0 :   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
    1351            0 :   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
    1352              : 
    1353            0 :   return true;
    1354              : }
    1355              : 
    1356              : /* Calculate the cost of one scalar iteration of the loop.
                      : 
                      :    Record a scalar_load, scalar_store or scalar_stmt entry in
                      :    LOOP_VINFO_SCALAR_ITERATION_COST for every assignment, call and
                      :    condition of LOOP_VINFO's loop that is relevant, or live as part of a
                      :    vectorizable cycle, skipping nop conversions.  Statements of the
                      :    inner loop are counted LOOP_VINFO_INNER_LOOP_COST_FACTOR times.  The
                      :    entries are then accumulated into LOOP_VINFO->scalar_costs.  */
    1357              : static void
    1358       363616 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
    1359              : {
    1360       363616 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    1361       363616 :   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    1362       363616 :   int nbbs = loop->num_nodes, factor;
    1363       363616 :   int innerloop_iters, i;
    1364              : 
    1365       363616 :   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
    1366              : 
    1367              :   /* Gather costs for statements in the scalar loop.  */
    1368              : 
    1369              :   /* FORNOW.  */
    1370       363616 :   innerloop_iters = 1;
    1371       363616 :   if (loop->inner)
    1372         1626 :     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
    1373              : 
    1374      1249696 :   for (i = 0; i < nbbs; i++)
    1375              :     {
    1376       886080 :       gimple_stmt_iterator si;
    1377       886080 :       basic_block bb = bbs[i];
    1378              : 
    1379       886080 :       if (bb->loop_father == loop->inner)
    1380              :         factor = innerloop_iters;
    1381              :       else
    1382       882828 :         factor = 1;
    1383              : 
    1384      7284422 :       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    1385              :         {
    1386      5512262 :           gimple *stmt = gsi_stmt (si);
    1387      5512262 :           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
    1388              : 
    1389      5512262 :           if (!is_gimple_assign (stmt)
    1390              :               && !is_gimple_call (stmt)
    1391              :               && !is_a<gcond *> (stmt))
    1392      1975585 :             continue;
    1393              : 
    1394              :           /* Skip stmts that are not vectorized inside the loop.  */
    1395      3536677 :           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
    1396      3536677 :           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
    1397      1741447 :               && (!STMT_VINFO_LIVE_P (vstmt_info)
    1398           53 :                   || !VECTORIZABLE_CYCLE_DEF
    1399              :                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
    1400      1741447 :             continue;
    1401              : 
    1402      1795230 :           vect_cost_for_stmt kind;
    1403      1795230 :           if (STMT_VINFO_DATA_REF (stmt_info))
    1404              :             {
    1405       863942 :               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    1406              :                kind = scalar_load;
    1407              :              else
    1408       323206 :                kind = scalar_store;
    1409              :             }
    1410       931288 :           else if (vect_nop_conversion_p (stmt_info))
    1411        53761 :             continue;
    1412              :           else
    1413              :             kind = scalar_stmt;
    1414              : 
    1415              :           /* We are using vect_prologue here to avoid scaling twice
    1416              :              by the inner loop factor: FACTOR above already applies it as
                      :              the statement count.  */
    1417      1741469 :           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
    1418              :                             factor, kind, stmt_info, 0, vect_prologue);
    1419              :         }
    1420              :     }
    1421              : 
    1422              :   /* Now accumulate cost.  */
    1423       363616 :   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
    1424       363616 :   add_stmt_costs (loop_vinfo->scalar_costs,
    1425              :                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
    1426       363616 :   loop_vinfo->scalar_costs->finish_cost (nullptr);
    1427       363616 : }
    1428              : 
    1429              : /* Function vect_analyze_loop_form.
    1430              : 
    1431              :    Verify that certain CFG restrictions hold, including:
    1432              :    - the loop has a pre-header
    1433              :    - the loop has a single entry
    1434              :    - nested loops can have only a single exit.
    1435              :    - the loop exit condition is simple enough
    1436              :    - the number of iterations can be analyzed, i.e, a countable loop.  The
    1437              :      niter could be analyzed under some assumptions.  */
    1438              : 
    1439              : opt_result
    1440       454156 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
    1441              :                         vect_loop_form_info *info)
    1442              : {
    1443       454156 :   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
    1444              : 
    1445       454156 :   edge exit_e = vec_init_loop_exit_info (loop);
    1446       454156 :   if (!exit_e)
    1447        29182 :     return opt_result::failure_at (vect_location,
    1448              :                                    "not vectorized:"
    1449              :                                    " Infinite loop detected.\n");
    1450       424974 :   if (loop_vectorized_call)
    1451              :     {
    1452        28761 :       tree arg = gimple_call_arg (loop_vectorized_call, 1);
    1453        28761 :       class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
    1454        28761 :       edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
    1455        28761 :       if (!scalar_exit_e)
    1456            0 :         return opt_result::failure_at (vect_location,
    1457              :                                        "not vectorized:"
    1458              :                                        " could not determine main exit from"
    1459              :                                        " loop with multiple exits.\n");
    1460              :     }
    1461              : 
    1462       424974 :   info->loop_exit = exit_e;
    1463       424974 :   if (dump_enabled_p ())
    1464        16041 :       dump_printf_loc (MSG_NOTE, vect_location,
    1465              :                        "using as main loop exit: %d -> %d [AUX: %p]\n",
    1466        16041 :                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
    1467              : 
    1468              :   /* Check if we have any control flow that doesn't leave the loop.  */
    1469       424974 :   basic_block *bbs = get_loop_body (loop);
    1470      1390602 :   for (unsigned i = 0; i < loop->num_nodes; i++)
    1471      1081009 :     if (EDGE_COUNT (bbs[i]->succs) != 1
    1472      1081009 :         && (EDGE_COUNT (bbs[i]->succs) != 2
    1473       646677 :             || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
    1474              :       {
    1475       115381 :         free (bbs);
    1476       115381 :         return opt_result::failure_at (vect_location,
    1477              :                                        "not vectorized:"
    1478              :                                        " unsupported control flow in loop.\n");
    1479              :       }
    1480              : 
    1481              :   /* Check that the loop has at least one PHI node; without any there is no scalar evolution to analyze.  */
    1482       310676 :   bool has_phi = false;
    1483       310676 :   for (unsigned i = 0; i < loop->num_nodes; i++)
    1484       310225 :     if (!gimple_seq_empty_p (phi_nodes (bbs[i])))
    1485              :       {
    1486              :         has_phi = true;
    1487              :         break;
    1488              :       }
    1489       309593 :   if (!has_phi)
    1490          451 :     return opt_result::failure_at (vect_location,
    1491              :                                    "not vectorized:"
    1492              :                                    " no scalar evolution detected in loop.\n");
    1493              : 
    1494       309142 :   free (bbs);
    1495              : 
    1496              :   /* Different restrictions apply when we are considering an inner-most loop,
    1497              :      vs. an outer (nested) loop.
    1498              :      (FORNOW. May want to relax some of these restrictions in the future).  */
    1499              : 
    1500       309142 :   info->inner_loop_cond = NULL;
    1501       309142 :   if (!loop->inner)
    1502              :     {
    1503              :       /* Inner-most loop.  */
    1504              : 
    1505       290604 :       if (empty_block_p (loop->header))
    1506            0 :         return opt_result::failure_at (vect_location,
    1507              :                                        "not vectorized: empty loop.\n");
    1508              :     }
    1509              :   else
    1510              :     {
    1511        18538 :       class loop *innerloop = loop->inner;
    1512        18538 :       edge entryedge;
    1513              : 
    1514              :       /* Nested loop. We currently require that the loop is doubly-nested,
    1515              :          contains a single inner loop with a single exit to the block
    1516              :          with the single exit condition in the outer loop.
    1517              :          Vectorizable outer-loops look like this:
    1518              : 
    1519              :                         (pre-header)
    1520              :                            |
    1521              :                           header <---+
    1522              :                            |         |
    1523              :                           inner-loop |
    1524              :                            |         |
    1525              :                           tail ------+
    1526              :                            |
    1527              :                         (exit-bb)
    1528              : 
    1529              :          The inner-loop also has the properties expected of inner-most loops
    1530              :          as described above.  */
    1531              : 
    1532        18538 :       if ((loop->inner)->inner || (loop->inner)->next)
    1533         2968 :         return opt_result::failure_at (vect_location,
    1534              :                                        "not vectorized:"
    1535              :                                        " multiple nested loops.\n");
    1536              : 
    1537        15570 :       entryedge = loop_preheader_edge (innerloop);
    1538        15570 :       if (entryedge->src != loop->header
    1539        15052 :           || !single_exit (innerloop)
    1540        26989 :           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
    1541         4445 :         return opt_result::failure_at (vect_location,
    1542              :                                        "not vectorized:"
    1543              :                                        " unsupported outerloop form.\n");
    1544              : 
    1545              :       /* Analyze the inner-loop.  */
    1546        11125 :       vect_loop_form_info inner;
    1547        11125 :       opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
    1548        11125 :       if (!res)
    1549              :         {
    1550          416 :           if (dump_enabled_p ())
    1551            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1552              :                              "not vectorized: Bad inner loop.\n");
    1553          416 :           return res;
    1554              :         }
    1555              : 
    1556              :       /* Don't support analyzing niter under assumptions for inner
    1557              :          loop.  */
    1558        10709 :       if (!integer_onep (inner.assumptions))
    1559          257 :         return opt_result::failure_at (vect_location,
    1560              :                                        "not vectorized: Bad inner loop.\n");
    1561              : 
    1562        10452 :       if (inner.number_of_iterations ==  chrec_dont_know
    1563        10452 :           || !expr_invariant_in_loop_p (loop, inner.number_of_iterations))
    1564         1837 :         return opt_result::failure_at (vect_location,
    1565              :                                        "not vectorized: inner-loop count not"
    1566              :                                        " invariant.\n");
    1567              : 
    1568         8615 :       if (dump_enabled_p ())
    1569         1050 :         dump_printf_loc (MSG_NOTE, vect_location,
    1570              :                          "Considering outer-loop vectorization.\n");
    1571         8615 :       info->inner_loop_cond = inner.conds[0];
    1572        11125 :     }
    1573              : 
    1574       299219 :   if (EDGE_COUNT (loop->header->preds) != 2)
    1575            0 :     return opt_result::failure_at (vect_location,
    1576              :                                    "not vectorized:"
    1577              :                                    " too many incoming edges.\n");
    1578              : 
    1579              :   /* We assume that the latch is empty.  */
    1580       299219 :   basic_block latch = loop->latch;
    1581       299219 :   do
    1582              :     {
    1583       299219 :       if (!empty_block_p (latch)
    1584       299219 :           || !gimple_seq_empty_p (phi_nodes (latch)))
    1585        20671 :         return opt_result::failure_at (vect_location,
    1586              :                                        "not vectorized: latch block not "
    1587              :                                        "empty.\n");
    1588       278548 :       latch = single_pred (latch);
    1589              :     }
    1590       557096 :   while (single_succ_p (latch));
    1591              : 
    1592              :   /* Make sure there is no abnormal exit.  */
    1593       278548 :   auto_vec<edge> exits = get_loop_exit_edges (loop);
    1594       956164 :   for (edge e : exits)
    1595              :     {
    1596       399101 :       if (e->flags & EDGE_ABNORMAL)
    1597           33 :         return opt_result::failure_at (vect_location,
    1598              :                                        "not vectorized:"
    1599              :                                        " abnormal loop exit edge.\n");
    1600              :     }
    1601              : 
    1602       278515 :   info->conds
    1603       278515 :     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
    1604              :                             &info->number_of_iterations,
    1605       278515 :                             &info->number_of_iterationsm1);
    1606       278515 :   if (info->conds.is_empty ())
    1607           33 :     return opt_result::failure_at
    1608           33 :       (vect_location,
    1609              :        "not vectorized: complicated exit condition.\n");
    1610              : 
    1611              :   /* Determine what the primary and alternate exit conds are.  */
    1612       677517 :   for (unsigned i = 0; i < info->conds.length (); i++)
    1613              :     {
    1614       399035 :       gcond *cond = info->conds[i];
    1615       399035 :       if (exit_e->src == gimple_bb (cond))
    1616       278482 :         std::swap (info->conds[0], info->conds[i]);
    1617              :     }
    1618              : 
    1619       278482 :   if (chrec_contains_undetermined (info->number_of_iterations))
    1620              :     {
    1621        58214 :       if (dump_enabled_p ())
    1622          259 :         dump_printf_loc (MSG_NOTE, vect_location,
    1623              :                          "Loop being analyzed as uncounted.\n");
    1624        58214 :       if (loop->inner)
    1625          562 :         return opt_result::failure_at
    1626          562 :           (vect_location,
    1627              :            "not vectorized: outer loop vectorization of uncounted loops"
    1628              :            " is unsupported.\n");
    1629        57652 :       return opt_result::success ();
    1630              :     }
    1631              : 
    1632       220268 :   if (integer_zerop (info->assumptions))
    1633            4 :     return opt_result::failure_at
    1634            4 :       (info->conds[0],
    1635              :        "not vectorized: number of iterations cannot be computed.\n");
    1636              : 
    1637       220264 :   if (integer_zerop (info->number_of_iterations))
    1638           12 :     return opt_result::failure_at
    1639           12 :       (info->conds[0],
    1640              :        "not vectorized: number of iterations = 0.\n");
    1641              : 
    1642       220252 :   if (!(tree_fits_shwi_p (info->number_of_iterations)
    1643       121238 :         && tree_to_shwi (info->number_of_iterations) > 0))
    1644              :     {
    1645        99014 :       if (dump_enabled_p ())
    1646              :         {
    1647         2481 :           dump_printf_loc (MSG_NOTE, vect_location,
    1648              :                            "Symbolic number of iterations is ");
    1649         2481 :           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
    1650         2481 :           dump_printf (MSG_NOTE, "\n");
    1651              :         }
    1652              :     }
    1653              : 
    1654       220252 :   if (!integer_onep (info->assumptions))
    1655              :     {
    1656         8623 :       if (dump_enabled_p ())
    1657              :         {
    1658           68 :           dump_printf_loc (MSG_NOTE, vect_location,
    1659              :                            "Loop to be versioned with niter assumption ");
    1660           68 :           dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
    1661           68 :           dump_printf (MSG_NOTE, "\n");
    1662              :         }
    1663              :     }
    1664              : 
    1665       220252 :   return opt_result::success ();
    1666       278548 : }
    1667              : 
    1668              : /* Create a loop_vec_info for LOOP with SHARED and the
    1669              :    vect_analyze_loop_form result INFO, and return it.
                      : 
                      :    The iteration counts, the exit conditions and the main exit edge are
                      :    copied from INFO.  Every condition in INFO->conds is marked as
                      :    vect_condition_def; unless the loop is uncounted the first one becomes
                      :    the loop IV condition, and all remaining ones are pushed onto
                      :    LOOP_VINFO_LOOP_CONDS (a non-empty list sets LOOP_VINFO_EARLY_BREAKS).
                      : 
                      :    ORIG_LOOP_INFO is stored as LOOP_VINFO_ORIG_LOOP_INFO.  When it is
                      :    itself an epilogue its main loop info is inherited, otherwise
                      :    ORIG_LOOP_INFO is also used as the main loop info.  The niter
                      :    assumptions are only recorded when ORIG_LOOP_INFO is null and the
                      :    assumptions are not the constant one.  */
    1670              : 
    1671              : loop_vec_info
    1672       575326 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
    1673              :                         const vect_loop_form_info *info,
    1674              :                         loop_vec_info orig_loop_info)
    1675              : {
    1676       575326 :   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
    1677       575326 :   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
    1678       575326 :   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
    1679       575326 :   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
    1680       575326 :   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
    1681       575326 :   if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
    1682          344 :     LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
    1683          344 :       = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
    1684              :   else
    1685       574982 :     LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
    1686              :   /* Also record the assumptions for versioning.  */
    1687       575326 :   if (!integer_onep (info->assumptions) && !orig_loop_info)
    1688        19517 :     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
    1689              : 
    1690      2552075 :   for (gcond *cond : info->conds)
    1691              :     {
    1692       826097 :       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
    1693              :       /* Mark the statement as a condition.  */
    1694       826097 :       STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
    1695              :     }
    1696              : 
    1697       575326 :   unsigned cond_id = 0;
    1698       575326 :   if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
    1699       491225 :     LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[cond_id++];
    1700              : 
    1701       910198 :   for (; cond_id < info->conds.length (); cond_id ++)
    1702       334872 :     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[cond_id]);
    1703              : 
    1704       575326 :   LOOP_VINFO_MAIN_EXIT (loop_vinfo) = info->loop_exit;
    1705              : 
    1706              :   /* Check to see if we're vectorizing multiple exits.  */
    1707       575326 :   LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
    1708       575326 :     = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
    1709              : 
    1710              :   /* At the moment we can't support no epilogs for multiple exits: the result
    1711              :      of the first compare should be masked by that of the second.  We can only
    1712              :      allow it if the early exits have the same live values.  For differing
    1713              :      values we have to calculate a third mask to disambiguate.  */
    1714       575326 :   LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo)
    1715       575326 :     = LOOP_VINFO_LOOP_CONDS (loop_vinfo).length () > 1;
    1716              : 
    1717       575326 :   if (info->inner_loop_cond)
    1718              :     {
    1719              :       /* If we have an estimate on the number of iterations of the inner
    1720              :          loop use that to limit the scale for costing, otherwise use
    1721              :          --param vect-inner-loop-cost-factor literally.  */
    1722         9042 :       widest_int nit;
    1723         9042 :       if (estimated_stmt_executions (loop->inner, &nit))
    1724         7735 :         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
    1725         7735 :           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    1726         9042 :     }
    1727              : 
    1728       575326 :   return loop_vinfo;
    1729              : }
    1730              : 
    1731              : 
    1732              : 
    1733              : /* Return true if we know that the iteration count is smaller than the
    1734              :    vectorization factor.  Return false if it isn't, or if we can't be sure
    1735              :    either way.
                      : 
                      :    The vectorization factor used is vect_vf_for_cost (LOOP_VINFO).  The
                      :    iteration count is LOOP_VINFO_INT_NITERS when the count is known and
                      :    max_stmt_executions_int of the loop otherwise; a count of -1 never
                      :    compares as smaller (presumably it means no bound is available).  */
    1736              : 
    1737              : static bool
    1738       154345 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
    1739              : {
    1740       154345 :   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
    1741              : 
    1742       154345 :   HOST_WIDE_INT max_niter;
    1743       154345 :   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    1744        79233 :     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
    1745              :   else
    1746        75112 :     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
    1747              : 
    1748       154345 :   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
    1749        10936 :     return true;
    1750              : 
    1751              :   return false;
    1752              : }
    1753              : 
    1754              : /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
    1755              :    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
    1756              :    definitely no, or -1 if it's worth retrying.
                      : 
                      :    SUGGESTED_UNROLL_FACTOR is passed on unchanged to
                      :    vect_estimate_min_profitable_iters.  When that estimate is not negative,
                      :    LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO) is set to the larger of the
                      :    estimated minimum profitable iteration count and
                      :    param_min_vect_loop_bound times the assumed VF.  */
    1757              : 
    1758              : static int
    1759       154354 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
    1760              :                            unsigned *suggested_unroll_factor)
    1761              : {
    1762       154354 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    1763       154354 :   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
    1764              : 
    1765              :   /* Only loops that can handle partially-populated vectors can have iteration
    1766              :      counts less than the vectorization factor.  */
    1767       154354 :   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
    1768       154354 :       && vect_known_niters_smaller_than_vf (loop_vinfo))
    1769              :     {
    1770        10926 :       if (dump_enabled_p ())
    1771          236 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1772              :                          "not vectorized: iteration count smaller than "
    1773              :                          "vectorization factor.\n");
    1774        10926 :       return 0;
    1775              :     }
    1776              : 
    1777              :   /* If we know the number of iterations we can do better.  For the
    1778              :      epilogue we can also decide whether the main loop leaves us
    1779              :      with enough iterations, preferring a smaller vector epilog which is
    1780              :      then also possibly used for the case we skip the vector loop.  */
    1781       143428 :   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    1782              :     {
    1783        69582 :       widest_int scalar_niters
    1784        69582 :         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
    1785        69582 :       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    1786              :         {
    1787         2639 :           loop_vec_info orig_loop_vinfo
    1788              :             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
    1789         2639 :           loop_vec_info main_loop_vinfo
    1790              :             = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
    1791         2639 :           unsigned lowest_vf
    1792         2639 :             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
    1793         2639 :           int prolog_peeling = 0;
    1794         2639 :           if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
    1795         2639 :             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
    1796         2639 :           if (prolog_peeling >= 0
    1797         2639 :               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
    1798              :                            lowest_vf))
    1799              :             {
    1800         5268 :               unsigned gap
    1801         2634 :                 = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
    1802         5268 :               scalar_niters = ((scalar_niters - gap - prolog_peeling)
    1803         5268 :                                % lowest_vf + gap);
    1804              :             }
    1805              :         }
    1806              :       /* Reject vectorizing for a single scalar iteration, even if
    1807              :          we could in principle implement that using partial vectors.
    1808              :          But allow such vectorization if VF == 1 in case we do not
    1809              :          need to peel for gaps (if we need, avoid vectorization for
    1810              :          reasons of code footprint).  */
    1811        69582 :       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
    1812        69582 :       if (scalar_niters <= peeling_gap + 1
    1813        69582 :           && (assumed_vf > 1 || peeling_gap != 0))
    1814              :         {
    1815          655 :           if (dump_enabled_p ())
    1816          159 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1817              :                              "not vectorized: loop only has a single "
    1818              :                              "scalar iteration.\n");
    1819          655 :           return 0;
    1820              :         }
    1821              : 
    1822        68927 :       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    1823              :         {
    1824              :           /* Check that the loop processes at least one full vector.  */
    1825        68916 :           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    1826        68916 :           if (known_lt (scalar_niters, vf))
    1827              :             {
    1828          348 :               if (dump_enabled_p ())
    1829          296 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1830              :                                  "loop does not have enough iterations "
    1831              :                                  "to support vectorization.\n");
    1832          388 :               return 0;
    1833              :             }
    1834              : 
    1835              :           /* If we need to peel an extra epilogue iteration to handle data
    1836              :              accesses with gaps, check that there are enough scalar iterations
    1837              :              available.
    1838              : 
    1839              :              The check above is redundant with this one when peeling for gaps,
    1840              :              but the distinction is useful for diagnostics.  */
    1841        68568 :           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
    1842        68874 :               && known_le (scalar_niters, vf))
    1843              :             {
    1844           40 :               if (dump_enabled_p ())
    1845            9 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1846              :                                  "loop does not have enough iterations "
    1847              :                                  "to support peeling for gaps.\n");
    1848           40 :               return 0;
    1849              :             }
    1850              :         }
    1851        69582 :     }
    1852              : 
    1853              :   /* If using the "very cheap" model, reject cases in which we'd keep
    1854              :      a copy of the scalar code (even if we might be able to vectorize it).  */
    1855       142385 :   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
    1856       142385 :       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
    1857        75438 :           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
    1858              :     {
    1859          721 :       if (dump_enabled_p ())
    1860            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1861              :                          "some scalar iterations would need to be peeled\n");
    1862          721 :       return 0;
    1863              :     }
    1864              : 
    1865       141664 :   int min_profitable_iters, min_profitable_estimate;
    1866       141664 :   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
    1867              :                                       &min_profitable_estimate,
    1868              :                                       suggested_unroll_factor);
    1869              : 
    1870       141664 :   if (min_profitable_iters < 0)
    1871              :     {
    1872        23820 :       if (dump_enabled_p ())
    1873           30 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1874              :                          "not vectorized: vectorization not profitable.\n");
    1875        23820 :       if (dump_enabled_p ())
    1876           30 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1877              :                          "not vectorized: vector version will never be "
    1878              :                          "profitable.\n");
    1879        23820 :       return -1;
    1880              :     }
    1881              : 
    1882       117844 :   int min_scalar_loop_bound = (param_min_vect_loop_bound
    1883       117844 :                                * assumed_vf);
    1884              : 
    1885              :   /* Use the cost model only if it is more conservative than the
    1886              :      user-specified threshold.  */
    1887       117844 :   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
    1888              :                                     min_profitable_iters);
    1889              : 
    1890       117844 :   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
    1891              : 
    1892        63442 :   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
    1893       181286 :       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    1894              :     {
    1895          436 :       if (dump_enabled_p ())
    1896            1 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1897              :                          "not vectorized: vectorization not profitable.\n");
    1898          436 :       if (dump_enabled_p ())
    1899            1 :         dump_printf_loc (MSG_NOTE, vect_location,
    1900              :                          "not vectorized: iteration count smaller than user "
    1901              :                          "specified loop bound parameter or minimum profitable "
    1902              :                          "iterations (whichever is more conservative).\n");
    1903          436 :       return 0;
    1904              :     }
    1905              : 
    1906              :   /* The static profitability threshold min_profitable_estimate includes
    1907              :      the cost of having to check at runtime whether the scalar loop
    1908              :      should be used instead.  If it turns out that we don't need or want
    1909              :      such a check, the threshold we should use for the static estimate
    1910              :      is simply the point at which the vector loop becomes more profitable
    1911              :      than the scalar loop.  */
    1912       117408 :   if (min_profitable_estimate > min_profitable_iters
    1913        25133 :       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
    1914        24578 :       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    1915          613 :       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
    1916       118021 :       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    1917              :     {
    1918           12 :       if (dump_enabled_p ())
    1919            7 :         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
    1920              :                          " choice between the scalar and vector loops\n");
    1921           12 :       min_profitable_estimate = min_profitable_iters;
    1922              :     }
    1923              : 
    1924              :   /* If the vector loop needs multiple iterations to be beneficial then
    1925              :      things are probably too close to call, and the conservative thing
    1926              :      would be to stick with the scalar code.  */
    1927       117408 :   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
    1928       117408 :       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    1929              :     {
    1930        18319 :       if (dump_enabled_p ())
    1931          223 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1932              :                          "one iteration of the vector loop would be"
    1933              :                          " more expensive than the equivalent number of"
    1934              :                          " iterations of the scalar loop\n");
    1935        18319 :       return 0;
    1936              :     }
    1937              : 
    1938        99089 :   HOST_WIDE_INT estimated_niter;
    1939              : 
    1940              :   /* If we are vectorizing an epilogue then we know the maximum number of
    1941              :      scalar iterations it will cover is at least one lower than the
    1942              :      vectorization factor of the main loop.  */
    1943        99089 :   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    1944        12044 :     estimated_niter
    1945        12044 :       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
    1946              :   else
    1947              :     {
    1948        87045 :       estimated_niter = estimated_stmt_executions_int (loop);
    1949        87045 :       if (estimated_niter == -1)
    1950        31573 :         estimated_niter = likely_max_stmt_executions_int (loop);
    1951              :     }
    1952        43617 :   if (estimated_niter != -1
    1953        96140 :       && ((unsigned HOST_WIDE_INT) estimated_niter
    1954        96140 :           < MAX (th, (unsigned) min_profitable_estimate)))
    1955              :     {
    1956         4297 :       if (dump_enabled_p ())
    1957           32 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1958              :                          "not vectorized: estimated iteration count too "
    1959              :                          "small.\n");
    1960         4297 :       if (dump_enabled_p ())
    1961           32 :         dump_printf_loc (MSG_NOTE, vect_location,
    1962              :                          "not vectorized: estimated iteration count smaller "
    1963              :                          "than specified loop bound parameter or minimum "
    1964              :                          "profitable iterations (whichever is more "
    1965              :                          "conservative).\n");
    1966         4297 :       return -1;
    1967              :     }
    1968              : 
    1969              :   /* As we cannot use a runtime check to gate profitability for uncounted
    1970              :      loops, require either an estimate or, if there is none, at least a
    1971              :      profitable vectorization within the first vector iteration (that
    1972              :      condition will practically never be true due to the required epilog
    1973              :      and likely alignment prologue).  */
    1974        94792 :   if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
    1975          163 :       && estimated_niter == -1
    1976        94928 :       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    1977              :     {
    1978          120 :       if (dump_enabled_p ())
    1979            2 :         dump_printf_loc (MSG_NOTE, vect_location,
    1980              :                          "not vectorized: no loop iteration estimate on the "
    1981              :                          "uncounted loop and not trivially profitable.\n");
    1982          120 :       return -1;
    1983              :     }
    1984              : 
    1985              :   return 1;
    1986              : }
    1987              : 
    1988              : /* Gather data references in LOOP with body BBS and store them into
    1989              :    *DATAREFS.
                      : 
                      :    Debug statements are skipped.  If vect_find_stmt_data_reference fails for
                      :    a statement its result is returned, except for calls in a loop with
                      :    nonzero safelen to a function that has SIMD clones when neither the call
                      :    arguments nor the call lhs is a decl or a reference with a base address;
                      :    such calls are ignored.  Also fail once *DATAREFS holds more than
                      :    param_loop_max_datarefs_for_datadeps entries.  */
    1990              : 
    1991              : static opt_result
    1992       276153 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
    1993              :                            vec<data_reference_p> *datarefs)
    1994              : {
    1995       826477 :   for (unsigned i = 0; i < loop->num_nodes; i++)
    1996      1224034 :     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    1997      5285394 :          !gsi_end_p (gsi); gsi_next (&gsi))
    1998              :       {
    1999      4735070 :         gimple *stmt = gsi_stmt (gsi);
    2000      4735070 :         if (is_gimple_debug (stmt))
    2001      2234013 :           continue;
    2002      2501187 :         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
    2003              :                                                         NULL, 0);
    2004      2501187 :         if (!res)
    2005              :           {
    2006        61823 :             if (is_gimple_call (stmt) && loop->safelen)
    2007              :               {
    2008          402 :                 tree fndecl = gimple_call_fndecl (stmt), op;
    2009          402 :                 if (fndecl == NULL_TREE
    2010          402 :                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
    2011              :                   {
    2012            0 :                     fndecl = gimple_call_arg (stmt, 0);
    2013            0 :                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
    2014            0 :                     fndecl = TREE_OPERAND (fndecl, 0);
    2015            0 :                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
    2016              :                   }
    2017          402 :                 if (fndecl != NULL_TREE)
    2018              :                   {
    2019          365 :                     cgraph_node *node = cgraph_node::get (fndecl);
    2020          365 :                     if (node != NULL && node->simd_clones != NULL)
    2021              :                       {
    2022          131 :                         unsigned int j, n = gimple_call_num_args (stmt);
    2023          545 :                         for (j = 0; j < n; j++)
    2024              :                           {
    2025          284 :                             op = gimple_call_arg (stmt, j);
    2026          284 :                             if (DECL_P (op)
    2027          284 :                                 || (REFERENCE_CLASS_P (op)
    2028            0 :                                     && get_base_address (op)))
    2029              :                               break;
    2030              :                           }
    2031          131 :                         op = gimple_call_lhs (stmt);
    2032              :                         /* Ignore #pragma omp declare simd functions
    2033              :                            if they don't have data references in the
    2034              :                            call stmt itself.  */
    2035          261 :                         if (j == n
    2036          131 :                             && !(op
    2037          120 :                                  && (DECL_P (op)
    2038          120 :                                      || (REFERENCE_CLASS_P (op)
    2039            0 :                                          && get_base_address (op)))))
    2040          130 :                           continue;
    2041              :                       }
    2042              :                   }
    2043              :               }
    2044        61693 :             return res;
    2045              :           }
    2046              :         /* If dependence analysis will give up due to the limit on the
    2047              :            number of datarefs stop here and fail fatally.  */
    2048      4278784 :         if (datarefs->length ()
    2049      1839420 :             > (unsigned)param_loop_max_datarefs_for_datadeps)
    2050            0 :           return opt_result::failure_at (stmt, "exceeded param "
    2051              :                                          "loop-max-datarefs-for-datadeps\n");
    2052              :       }
    2053       214460 :   return opt_result::success ();
    2054              : }
    2055              : 
    2056              : /* Determine if operating on full vectors for LOOP_VINFO might leave
    2057              :    some scalar iterations still to do.  If so, decide how we should
    2058              :    handle those scalar iterations.  The possibilities are:
    2059              : 
    2060              :    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
    2061              :        In this case:
    2062              : 
    2063              :          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
    2064              :          LOOP_VINFO_PEELING_FOR_NITER == false
    2065              : 
    2066              :    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
    2067              :        to handle the remaining scalar iterations.  In this case:
    2068              : 
    2069              :          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
    2070              :          LOOP_VINFO_PEELING_FOR_NITER == true
    2071              : 
    2072              :    The MASKED_P argument specifies to what extent
    2073              :    param_vect_partial_vector_usage is to be honored.  For MASKED_P == 0
    2074              :    no partial vectors are to be used, for MASKED_P == -1 it's
    2075              :    param_vect_partial_vector_usage that gets to decide whether we may
    2076              :    consider partial vector usage.  For MASKED_P == 1 partial vectors
    2077              :    may be used if possible.
    2078              : 
    2079              :  */
    2080              : 
    2081              : static opt_result
    2082       155269 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
    2083              :                                             int masked_p)
    2084              : {
    2085              :   /* Determine whether there would be any scalar iterations left over.  */
    2086       155269 :   bool need_peeling_or_partial_vectors_p
    2087       155269 :     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
    2088              : 
    2089              :   /* Decide whether to vectorize the loop with partial vectors.  */
    2090       155269 :   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2091       155269 :   if (masked_p == 0
    2092       155269 :       || (masked_p == -1 && param_vect_partial_vector_usage == 0))
    2093              :     /* If explicitly requested, do not use partial vectors.  */
    2094              :     LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2095          207 :   else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
    2096           65 :            && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
    2097            0 :     LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    2098          207 :   else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
    2099           65 :            && need_peeling_or_partial_vectors_p)
    2100              :     {
    2101              :       /* For partial-vector-usage=1, try to push the handling of partial
    2102              :          vectors to the epilogue, with the main loop continuing to operate
    2103              :          on full vectors.
    2104              : 
    2105              :          If we are unrolling, we also do not want to use partial vectors.
    2106              :          This is to avoid the overhead of generating multiple masks and also
    2107              :          to avoid having to execute entire iterations of all-FALSE masked
    2108              :          instructions when dealing with one or fewer full iterations.
    2109              : 
    2110              :          ??? We could then end up failing to use partial vectors if we
    2111              :          decide to peel iterations into a prologue, and if the main loop
    2112              :          then ends up processing fewer than VF iterations.  */
    2113           43 :       if ((param_vect_partial_vector_usage == 1
    2114           11 :            || loop_vinfo->suggested_unroll_factor > 1)
    2115           32 :           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
    2116           65 :           && !vect_known_niters_smaller_than_vf (loop_vinfo))
    2117              :         ;  /* Keep full vectors; the flag stays false as set above.  */
    2118              :       else
    2119           31 :         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    2120              :     }
    2121              :   /* Give up if partial vectors are mandatory but were not selected.  */
    2122       155269 :   if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
    2123            0 :       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    2124            0 :     return opt_result::failure_at (vect_location,
    2125              :                                    "not vectorized: loop needs but cannot "
    2126              :                                    "use partial vectors\n");
    2127              : 
    2128       155269 :   if (dump_enabled_p ())
    2129        12551 :     dump_printf_loc (MSG_NOTE, vect_location,
    2130              :                      "operating on %s vectors%s.\n",
    2131        12551 :                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
    2132              :                      ? "partial" : "full",
    2133        12551 :                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
    2134              :                      ? " for epilogue loop" : "");
    2135              :   /* Without partial vectors, leftover iterations require peeling.  */
    2136       155269 :   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    2137       310538 :     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
    2138       155269 :        && need_peeling_or_partial_vectors_p);
    2139              : 
    2140       155269 :   return opt_result::success ();
    2141              : }
    2142              : 
    2143              : /* Function vect_analyze_loop_2.
    2144              : 
    2145              :    Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
    2146              :    analyses will record information in some members of LOOP_VINFO.  FATAL
    2147              :    indicates whether some analysis hit a fatal error.  If a non-NULL pointer
    2148              :    SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
    2149              :    worked-out suggested unroll factor, while a NULL pointer indicates that
    2150              :    the suggested unroll factor is going to be applied.
    2151              :    SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF holds whether single-lane
    2152              :    SLP was forced when the suggested unroll factor was worked out.  */
    2153              : static opt_result
    2154       574626 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
    2155              :                      unsigned *suggested_unroll_factor,
    2156              :                      bool& single_lane_slp_done_for_suggested_uf)
    2157              : {
    2158       574626 :   opt_result ok = opt_result::success ();
    2159       574626 :   int res;
    2160       574626 :   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
    2161       574626 :   loop_vec_info orig_loop_vinfo = NULL;
    2162              : 
    2163              :   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
    2164              :      loop_vec_info of the first vectorized loop.  */
    2165       574626 :   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    2166        13842 :     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
    2167              :   else
    2168              :     orig_loop_vinfo = loop_vinfo;
    2169        13842 :   gcc_assert (orig_loop_vinfo);
    2170              : 
    2171              :   /* We can't mask on niters for uncounted loops due to unknown upper bound.  */
    2172       574626 :   if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
    2173        84101 :     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2174              : 
    2175              :   /* The first group of checks is independent of the vector size.  */
    2176       574626 :   fatal = true;
    2177              : 
    2178       574626 :   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
    2179       574626 :       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
    2180            5 :     return opt_result::failure_at (vect_location,
    2181              :                                    "not vectorized: simd if(0)\n");
    2182              : 
    2183              :   /* Find all data references in the loop (which correspond to vdefs/vuses)
    2184              :      and analyze their evolution in the loop.  */
    2185              : 
    2186       574621 :   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
    2187              : 
    2188              :   /* Gather the data references.  */
    2189       574621 :   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
    2190              :     {
    2191       276153 :       opt_result res
    2192       276153 :         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
    2193              :                                      &LOOP_VINFO_DATAREFS (loop_vinfo));
    2194       276153 :       if (!res)
    2195              :         {
    2196        61693 :           if (dump_enabled_p ())
    2197         1630 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2198              :                              "not vectorized: loop contains function "
    2199              :                              "calls or data references that cannot "
    2200              :                              "be analyzed\n");
    2201        61693 :           return res;
    2202              :         }
    2203       214460 :       loop_vinfo->shared->save_datarefs ();
    2204              :     }
    2205              :   else
    2206       298468 :     loop_vinfo->shared->check_datarefs ();
    2207              : 
    2208              :   /* Analyze the data references and also adjust the minimal
    2209              :      vectorization factor according to the loads and stores.  */
    2210              : 
    2211       512928 :   ok = vect_analyze_data_refs (loop_vinfo, &fatal);
    2212       512928 :   if (!ok)
    2213              :     {
    2214        72476 :       if (dump_enabled_p ())
    2215         1231 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2216              :                          "bad data references.\n");
    2217        72476 :       return ok;
    2218              :     }
    2219              : 
    2220              :   /* Check if we are applying unroll factor now.  */
    2221       440452 :   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
    2222       440452 :   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
    2223              : 
    2224              :   /* When single-lane SLP was forced and we are applying suggested unroll
    2225              :      factor, keep that decision here.  */
    2226       880904 :   bool force_single_lane = (applying_suggested_uf
    2227       440452 :                             && single_lane_slp_done_for_suggested_uf);
    2228              : 
    2229              :   /* Classify all cross-iteration scalar data-flow cycles.
    2230              :      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
    2231       440452 :   vect_analyze_scalar_cycles (loop_vinfo);
    2232              : 
    2233       440452 :   vect_pattern_recog (loop_vinfo);
    2234              : 
    2235              :   /* Analyze the access patterns of the data-refs in the loop (consecutive,
    2236              :      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
    2237              : 
    2238       440452 :   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
    2239       440452 :   if (!ok)
    2240              :     {
    2241         8010 :       if (dump_enabled_p ())
    2242          292 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2243              :                          "bad data access.\n");
    2244         8010 :       return ok;
    2245              :     }
    2246              : 
    2247              :   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
    2248              : 
    2249       432442 :   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
    2250       432442 :   if (!ok)
    2251              :     {
    2252        45036 :       if (dump_enabled_p ())
    2253          398 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2254              :                          "unexpected pattern.\n");
    2255        45036 :       return ok;
    2256              :     }
    2257              : 
    2258              :   /* While the rest of the analysis below depends on it in some way.  */
    2259       387406 :   fatal = false;
    2260              : 
    2261              :   /* Analyze data dependences between the data-refs in the loop
    2262              :      and adjust the maximum vectorization factor according to
    2263              :      the dependences.
    2264              :      FORNOW: fail at the first data dependence that we encounter.  */
    2265              : 
    2266       387406 :   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
    2267       387406 :   if (!ok)
    2268              :     {
    2269        23790 :       if (dump_enabled_p ())
    2270          538 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2271              :                          "bad data dependence.\n");
    2272        23790 :       return ok;
    2273              :     }
    2274       363616 :   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
    2275              : 
    2276              :   /* Compute the scalar iteration cost.  */
    2277       363616 :   vect_compute_single_scalar_iteration_cost (loop_vinfo);
    2278              : 
    2279       363616 :   bool saved_can_use_partial_vectors_p
    2280              :     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
    2281              : 
    2282              :   /* This is the point where we can re-start analysis with single-lane
    2283              :      SLP forced.  */
    2284       498511 : start_over:
    2285              : 
    2286              :   /* Check the SLP opportunities in the loop, analyze and build
    2287              :      SLP trees.  */
    2288       997022 :   ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
    2289              :                          force_single_lane);
    2290       498511 :   if (!ok)
    2291        24865 :     return ok;
    2292              : 
    2293              :   /* If there are any SLP instances mark them as pure_slp and compute
    2294              :      the overall vectorization factor.  */
    2295       473646 :   if (!vect_make_slp_decision (loop_vinfo))
    2296        61279 :     return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
    2297              : 
    2298       412367 :   if (dump_enabled_p ())
    2299        19116 :     dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
    2300              : 
    2301              :   /* Dump the vectorization factor from the SLP decision.  */
    2302       412367 :   if (dump_enabled_p ())
    2303              :     {
    2304        19116 :       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
    2305        19116 :       dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
    2306        19116 :       dump_printf (MSG_NOTE, "\n");
    2307              :     }
    2308              : 
    2309              :   /* We don't expect to have to roll back to anything other than an empty
    2310              :      set of rgroups.  */
    2311       412367 :   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
    2312              : 
    2313              :   /* Apply the suggested unrolling factor, this was determined by the backend
    2314              :      during finish_cost the first time we ran the analysis for this
    2315              :      vector mode.  */
    2316       412367 :   if (applying_suggested_uf)
    2317          456 :     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
    2318              : 
    2319              :   /* Now the vectorization factor is final.  */
    2320       412367 :   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    2321       412367 :   gcc_assert (known_ne (vectorization_factor, 0U));
    2322              : 
    2323              :   /* Optimize the SLP graph with the vectorization factor fixed.  */
    2324       412367 :   vect_optimize_slp (loop_vinfo);
    2325              : 
    2326              :   /* Gather the loads reachable from the SLP graph entries.  */
    2327       412367 :   vect_gather_slp_loads (loop_vinfo);
    2328              : 
    2329       412367 :   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    2330              :     {
    2331        14203 :       dump_printf_loc (MSG_NOTE, vect_location,
    2332              :                        "vectorization_factor = ");
    2333        14203 :       dump_dec (MSG_NOTE, vectorization_factor);
    2334        14203 :       dump_printf (MSG_NOTE, ", niters = %wd\n",
    2335        14203 :                    LOOP_VINFO_INT_NITERS (loop_vinfo));
    2336              :     }
    2337              : 
    2338       412367 :   if (max_vf != MAX_VECTORIZATION_FACTOR
    2339       412367 :       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
    2340           41 :     return opt_result::failure_at (vect_location, "bad data dependence.\n");
    2341              : 
    2342       412326 :   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
    2343              : 
    2344              :   /* Analyze the alignment of the data-refs in the loop.  */
    2345       412326 :   vect_analyze_data_refs_alignment (loop_vinfo);
    2346              : 
    2347              :   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
    2348              :      It is important to call pruning after vect_analyze_data_ref_accesses,
    2349              :      since we use grouping information gathered by interleaving analysis.  */
    2350       412326 :   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
    2351       412326 :   if (!ok)
    2352        16739 :     return ok;
    2353              : 
    2354              :   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
    2355              :      vectorization, since we do not want to add extra peeling or
    2356              :      add versioning for alignment.  */
    2357       395587 :   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    2358              :     /* This pass will decide on using loop versioning and/or loop peeling in
    2359              :        order to enhance the alignment of data references in the loop.  */
    2360       380940 :     ok = vect_enhance_data_refs_alignment (loop_vinfo);
    2361       395587 :   if (!ok)
    2362            0 :     return ok;
    2363              : 
    2364              :   /* Analyze operations in the SLP instances.  We can't simply
    2365              :      remove unsupported SLP instances as this makes the above
    2366              :      SLP kind detection invalid and might also affect the VF.  */
    2367       395587 :   if (! vect_slp_analyze_operations (loop_vinfo))
    2368              :     {
    2369       240318 :       ok = opt_result::failure_at (vect_location,
    2370              :                                    "unsupported SLP instances\n");
    2371       240318 :       goto again;
    2372              :     }
    2373              : 
    2374              :   /* For now, we don't expect to mix both masking and length approaches for one
    2375              :      loop, disable it if both are recorded.  */
    2376       155269 :   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
    2377        23399 :       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
    2378       178662 :       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    2379              :     {
    2380            0 :       if (dump_enabled_p ())
    2381            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2382              :                          "can't vectorize a loop with partial vectors"
    2383              :                          " because we don't expect to mix different"
    2384              :                          " approaches with partial vectors for the"
    2385              :                          " same loop.\n");
    2386            0 :       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2387              :     }
    2388              : 
    2389              :   /* If we still have the option of using partial vectors,
    2390              :      check whether we can generate the necessary loop controls.  */
    2391       155269 :   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    2392              :     {
    2393        23399 :       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    2394              :         {
    2395        23393 :           if (!vect_verify_full_masking (loop_vinfo)
    2396        23393 :               && !vect_verify_full_masking_avx512 (loop_vinfo))
    2397         6063 :             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2398              :         }
    2399              :       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
    2400            6 :         if (!vect_verify_loop_lens (loop_vinfo))
    2401            6 :           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2402              :     }
    2403              : 
    2404              :   /* Decide whether this loop_vinfo should use partial vectors or peeling,
    2405              :      assuming that the loop will be used as a main loop.  We will redo
    2406              :      this analysis later if we instead decide to use the loop as an
    2407              :      epilogue loop.  */
    2408       155269 :   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
    2409       155269 :   if (!ok)
    2410            0 :     return ok;
    2411              : 
    2412              :   /* If we're vectorizing a loop that uses length "controls" and
    2413              :      can iterate more than once, we apply decrementing IV approach
    2414              :      in loop control.  */
    2415       155269 :   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
    2416           31 :       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
    2417            0 :       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
    2418       155269 :       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
    2419            0 :            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
    2420              :                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
    2421            0 :     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
    2422              : 
    2423              :   /* If a loop uses length controls and has a decrementing loop control IV,
    2424              :      we will normally pass that IV through a MIN_EXPR to calculate the
    2425              :      basis for the length controls.  E.g. in a loop that processes one
    2426              :      element per scalar iteration, the number of elements would be
    2427              :      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
    2428              : 
    2429              :      This MIN_EXPR approach allows us to use pointer IVs with an invariant
    2430              :      step, since only the final iteration of the vector loop can have
    2431              :      inactive lanes.
    2432              : 
    2433              :      However, some targets have a dedicated instruction for calculating the
    2434              :      preferred length, given the total number of elements that still need to
    2435              :      be processed.  This is encapsulated in the SELECT_VL internal function.
    2436              : 
    2437              :      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
    2438              :      to determine the basis for the length controls.  However, unlike the
    2439              :      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
    2440              :      lanes inactive in any iteration of the vector loop, not just the last
    2441              :      iteration.  This SELECT_VL approach therefore requires us to use pointer
    2442              :      IVs with variable steps.
    2443              : 
    2444              :      Once we've decided how many elements should be processed by one
    2445              :      iteration of the vector loop, we need to populate the rgroup controls.
    2446              :      If a loop has multiple rgroups, we need to make sure that those rgroups
    2447              :      "line up" (that is, they must be consistent about which elements are
    2448              :      active and which aren't).  This is done by vect_adjust_loop_lens_control.
    2449              : 
    2450              :      In principle, it would be possible to use vect_adjust_loop_lens_control
    2451              :      on either the result of a MIN_EXPR or the result of a SELECT_VL.
    2452              :      However:
    2453              : 
    2454              :      (1) In practice, it only makes sense to use SELECT_VL when a vector
    2455              :          operation will be controlled directly by the result.  It is not
    2456              :          worth using SELECT_VL if it would only be the input to other
    2457              :          calculations.
    2458              : 
    2459              :      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
    2460              :          pointer IV will need N updates by a variable amount (N-1 updates
    2461              :          within the iteration and 1 update to move to the next iteration).
    2462              : 
    2463              :      Because of this, we prefer to use the MIN_EXPR approach whenever there
    2464              :      is more than one length control.
    2465              : 
    2466              :      In addition, SELECT_VL always operates to a granularity of 1 unit.
    2467              :      If we wanted to use it to control an SLP operation on N consecutive
    2468              :      elements, we would need to make the SELECT_VL inputs measure scalar
    2469              :      iterations (rather than elements) and then multiply the SELECT_VL
    2470              :      result by N.  But using SELECT_VL this way is inefficient because
    2471              :      of (1) above.
    2472              : 
    2473              :      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
    2474              :         satisfied:
    2475              : 
    2476              :      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
    2477              :      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
    2478              : 
    2479              :      Since SELECT_VL (variable step) will make SCEV analysis fail, we would
    2480              :      then lose the benefits of subsequent unroll optimizations.  We prefer
    2481              :      using the MIN_EXPR approach in this situation.  */
    2482       155269 :   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
    2483              :     {
    2484            0 :       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
    2485            0 :       if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
    2486            0 :           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
    2487            0 :           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
    2488              :               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
    2489            0 :         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
    2490              : 
    2491            0 :       if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
    2492            0 :         for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
    2493            0 :           if (rgc.type
    2494            0 :               && !direct_internal_fn_supported_p (IFN_SELECT_VL,
    2495              :                                                   rgc.type, iv_type,
    2496              :                                                   OPTIMIZE_FOR_SPEED))
    2497              :             {
    2498            0 :               LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
    2499            0 :               break;
    2500              :             }
    2501              : 
    2502              :       /* If any of the SLP instances cover more than a single lane
    2503              :          we cannot use .SELECT_VL at the moment, even if the number
    2504              :          of lanes is uniform throughout the SLP graph.  */
    2505            0 :       if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
    2506            0 :         for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
    2507            0 :           if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
    2508            0 :               && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
    2509            0 :                    && SLP_INSTANCE_TREE (inst)->ldst_lanes))
    2510              :             {
    2511            0 :               LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
    2512            0 :               break;
    2513              :             }
    2514              :     }
    2515              : 
    2516              :   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
    2517              :      to be able to handle fewer than VF scalars, or needs to have a lower VF
    2518              :      than the main loop.  */
    2519       155269 :   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
    2520        13440 :       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    2521              :     {
    2522        13426 :       poly_uint64 unscaled_vf
    2523        13426 :         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
    2524              :                      orig_loop_vinfo->suggested_unroll_factor);
    2525        13426 :       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
    2526          379 :         return opt_result::failure_at (vect_location,
    2527              :                                        "Vectorization factor too high for"
    2528              :                                        " epilogue loop.\n");
    2529              :     }
    2530              : 
    2531              :   /* If the epilogue needs peeling for gaps but the main loop doesn't, give
    2532              :      up on the epilogue.  */
    2533       154890 :   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
    2534        13061 :       && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
    2535           73 :       && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
    2536              :           != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
    2537            4 :     return opt_result::failure_at (vect_location,
    2538              :                                    "Epilogue loop requires peeling for gaps "
    2539              :                                    "but main loop does not.\n");
    2540              : 
    2541              :   /* If an epilogue loop is required make sure we can create one.  */
    2542       154886 :   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
    2543       153609 :       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    2544        56587 :       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    2545              :     {
    2546        99814 :       if (dump_enabled_p ())
    2547         5552 :         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
    2548        99814 :       if (!vect_can_advance_ivs_p (loop_vinfo)
    2549       199096 :           || !slpeel_can_duplicate_loop_p (loop,
    2550              :                                            LOOP_VINFO_MAIN_EXIT (loop_vinfo),
    2551        99282 :                                            LOOP_VINFO_MAIN_EXIT (loop_vinfo)))
    2552              :         {
    2553          532 :           ok = opt_result::failure_at (vect_location,
    2554              :                                        "not vectorized: can't create required "
    2555              :                                        "epilog loop\n");
    2556          532 :           goto again;
    2557              :         }
    2558              :     }
    2559              : 
    2560              :   /* Check the costings of the loop make vectorizing worthwhile.  */
    2561       154354 :   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
    2562       154354 :   if (res < 0 && !param_vect_allow_possibly_not_worthwhile_vectorizations)
    2563              :     {
    2564        28237 :       ok = opt_result::failure_at (vect_location,
    2565              :                                    "Loop costings may not be worthwhile.\n");
    2566        28237 :       goto again;
    2567              :     }
    2568       126117 :   if (!res)
    2569        31445 :     return opt_result::failure_at (vect_location,
    2570              :                                    "Loop costings not worthwhile.\n");
    2571              : 
    2572              :   /* During peeling, we need to check if number of loop iterations is
    2573              :      enough for both peeled prolog loop and vector loop.  This check
    2574              :      can be merged along with threshold check of loop versioning, so
    2575              :      increase threshold for this case if necessary.
    2576              : 
    2577              :      If we are analyzing an epilogue we still want to check what its
    2578              :      versioning threshold would be.  If we decide to vectorize the epilogues we
    2579              :      will want to use the lowest versioning threshold of all epilogues and main
    2580              :      loop.  This will enable us to enter a vectorized epilogue even when
    2581              :      versioning the loop.  We can't simply check whether the epilogue requires
    2582              :      versioning though since we may have skipped some versioning checks when
    2583              :      analyzing the epilogue.  For instance, checks for alias versioning will be
    2584              :      skipped when dealing with epilogues as we assume we already checked them
    2585              :      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
    2586        94672 :   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
    2587              :     {
    2588         8949 :       poly_uint64 niters_th = 0;
    2589         8949 :       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
    2590              : 
    2591         8949 :       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
    2592              :         {
    2593              :           /* Niters for peeled prolog loop.  */
    2594         8949 :           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
    2595              :             {
    2596          115 :               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
    2597          115 :               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
    2598          115 :               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
    2599              :             }
    2600              :           else
    2601         8834 :             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    2602              :         }
    2603              : 
    2604              :       /* Niters for at least one iteration of vectorized loop.  */
    2605         8949 :       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    2606         8945 :         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    2607              :       /* One additional iteration because of peeling for gap.  */
    2608         8949 :       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    2609           66 :         niters_th += 1;
    2610              : 
    2611              :       /*  Use the same condition as vect_transform_loop to decide when to use
    2612              :           the cost to determine a versioning threshold.  */
    2613         8949 :       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
    2614         8949 :           && ordered_p (th, niters_th))
    2615         6631 :         niters_th = ordered_max (poly_uint64 (th), niters_th);
    2616              : 
    2617         8949 :       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
    2618              :     }
    2619              : 
    2620        94672 :   gcc_assert (known_eq (vectorization_factor,
    2621              :                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
    2622              : 
    2623        94672 :   single_lane_slp_done_for_suggested_uf = force_single_lane;
    2624              : 
    2625              :   /* Ok to vectorize!  */
    2626        94672 :   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
    2627        94672 :   return opt_result::success ();
    2628              : 
    2629       269087 : again:
    2630              :   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
    2631       269087 :   gcc_assert (!ok);
    2632              : 
    2633              :   /* Try again with single-lane SLP, unless that was already forced.  */
    2634       269087 :   if (force_single_lane)
    2635       133230 :     return ok;
    2636              : 
    2637              :   /* If we are applying suggested unroll factor, we don't need to
    2638              :      re-try any more as we want to keep the SLP mode fixed.  */
    2639       135857 :   if (applying_suggested_uf)
    2640           10 :     return ok;
    2641              : 
    2642              :   /* Likewise if the grouped loads or stores in the SLP cannot be handled
    2643              :      via interleaving or lane instructions.  */
    2644              :   slp_instance instance;
    2645              :   slp_tree node;
    2646              :   unsigned i, j;
    2647       368637 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    2648              :     {
    2649       233742 :       if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
    2650            0 :         continue;
    2651              : 
    2652       233742 :       stmt_vec_info vinfo;
    2653       233742 :       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
    2654       233742 :       if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
    2655       230980 :         continue;
    2656         2762 :       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
    2657         2762 :       unsigned int size = DR_GROUP_SIZE (vinfo);
    2658         2762 :       tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
    2659         2762 :       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
    2660         4849 :          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
    2661         5518 :          && ! vect_grouped_store_supported (vectype, size))
    2662          669 :         return opt_result::failure_at (vinfo->stmt,
    2663              :                                        "unsupported grouped store\n");
    2664       236308 :       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
    2665              :         {
    2666         2238 :           vinfo = SLP_TREE_REPRESENTATIVE (node);
    2667         2238 :           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
    2668              :             {
    2669         1957 :               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
    2670         1957 :               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
    2671         1957 :               size = DR_GROUP_SIZE (vinfo);
    2672         1957 :               vectype = SLP_TREE_VECTYPE (node);
    2673         1957 :               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
    2674         1957 :                   && ! vect_grouped_load_supported (vectype, single_element_p,
    2675              :                                                     size))
    2676          283 :                 return opt_result::failure_at (vinfo->stmt,
    2677              :                                                "unsupported grouped load\n");
    2678              :             }
    2679              :         }
    2680              :     }
    2681              : 
    2682              :   /* Roll back state appropriately.  Force single-lane SLP this time.  */
    2683       134895 :   force_single_lane = true;
    2684       134895 :   if (dump_enabled_p ())
    2685         3381 :     dump_printf_loc (MSG_NOTE, vect_location,
    2686              :                      "re-trying with single-lane SLP\n");
    2687              : 
    2688              :   /* Reset the vectorization factor.  */
    2689       134895 :   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
    2690              :   /* Free the SLP instances.  */
    2691       367678 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
    2692       232783 :     vect_free_slp_instance (instance);
    2693       134895 :   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
    2694              :   /* Reset altered state on stmts.  */
    2695       512633 :   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    2696              :     {
    2697       377738 :       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
    2698       377738 :       for (gimple_stmt_iterator si = gsi_start_phis (bb);
    2699       679514 :            !gsi_end_p (si); gsi_next (&si))
    2700              :         {
    2701       301776 :           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
    2702       301776 :           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
    2703       301776 :               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    2704              :             {
    2705              :               /* vectorizable_reduction adjusts reduction stmt def-types,
    2706              :                  restore them to that of the PHI.  */
    2707        25668 :               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
    2708        25668 :                 = STMT_VINFO_DEF_TYPE (stmt_info);
    2709        25668 :               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
    2710              :                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
    2711        25668 :                 = STMT_VINFO_DEF_TYPE (stmt_info);
    2712              :             }
    2713              :         }
    2714              :     }
    2715              :   /* Free optimized alias test DDRS.  */
    2716       134895 :   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
    2717       134895 :   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
    2718       134895 :   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
    2719              :   /* Reset target cost data.  */
    2720       134895 :   delete loop_vinfo->vector_costs;
    2721       134895 :   loop_vinfo->vector_costs = nullptr;
    2722              :   /* Reset accumulated rgroup information.  */
    2723       134895 :   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
    2724       134895 :   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
    2725       134895 :   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
    2726              :   /* Reset assorted flags.  */
    2727       134895 :   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
    2728       134895 :   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
    2729       134895 :   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
    2730       134895 :   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
    2731       134895 :   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
    2732       134895 :     = saved_can_use_partial_vectors_p;
    2733       134895 :   LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2734       134895 :   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
    2735       134895 :   LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
    2736       134895 :   LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = false;
    2737              : 
    2738       134895 :   if (loop_vinfo->scan_map)
    2739          122 :     loop_vinfo->scan_map->empty ();
    2740              : 
    2741       134895 :   goto start_over;
    2742              : }
    2743              : 
    2744              : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
    2745              :    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
    2746              :    OLD_LOOP_VINFO is better unless something specifically indicates
    2747              :    otherwise.
    2748              : 
    2749              :    Note that this deliberately isn't a partial order.  */
    2750              : 
    2751              : static bool
    2752        32461 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
    2753              :                           loop_vec_info old_loop_vinfo)
    2754              : {
    2755        32461 :   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
    2756        32461 :   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
    2757              : 
    2758        32461 :   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
    2759        32461 :   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
    2760              : 
    2761              :   /* Always prefer a VF of loop->simdlen over any other VF.  */
    2762        32461 :   if (loop->simdlen)
    2763              :     {
    2764            0 :       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
    2765            0 :       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
    2766            0 :       if (new_simdlen_p != old_simdlen_p)
    2767              :         return new_simdlen_p;
    2768              :     }
    2769              : 
    2770        32461 :   const auto *old_costs = old_loop_vinfo->vector_costs;
    2771        32461 :   const auto *new_costs = new_loop_vinfo->vector_costs;
    2772        32461 :   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
    2773         1482 :     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
    2774              : 
    2775        30979 :   return new_costs->better_main_loop_than_p (old_costs);
    2776              : }
    2777              : 
    2778              : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
    2779              :    true if we should.  */
    2780              : 
    2781              : static bool
    2782        32461 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
    2783              :                         loop_vec_info old_loop_vinfo)
    2784              : {
    2785        32461 :   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    2786              :     return false;
    2787              : 
    2788         1348 :   if (dump_enabled_p ())
    2789           12 :     dump_printf_loc (MSG_NOTE, vect_location,
    2790              :                      "***** Preferring vector mode %s to vector mode %s\n",
    2791           12 :                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
    2792           12 :                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
    2793              :   return true;
    2794              : }
    2795              : 
    2796              : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
    2797              :    not NULL.  When MASKED_P is not -1 override the default
    2798              :    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
    2799              :    Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is VOIDmode and
    2800              :    advance MODE_I to the next mode useful to analyze.
    2801              :    Return the loop_vinfo on success and wrapped null on failure.  */
    2802              : 
    2803              : static opt_loop_vec_info
    2804       574170 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
    2805              :                      const vect_loop_form_info *loop_form_info,
    2806              :                      loop_vec_info orig_loop_vinfo,
    2807              :                      const vector_modes &vector_modes, unsigned &mode_i,
    2808              :                      int masked_p,
    2809              :                      machine_mode &autodetected_vector_mode,
    2810              :                      bool &fatal)
    2811              : {
    2812       574170 :   loop_vec_info loop_vinfo
    2813       574170 :     = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
    2814              : 
    2815       574170 :   machine_mode vector_mode = vector_modes[mode_i];
    2816       574170 :   loop_vinfo->vector_mode = vector_mode;
    2817       574170 :   unsigned int suggested_unroll_factor = 1;
    2818       574170 :   bool single_lane_slp_done_for_suggested_uf = false;
    2819              : 
    2820              :   /* Run the main analysis.  */
    2821       574170 :   opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
    2822              :                                         &suggested_unroll_factor,
    2823              :                                         single_lane_slp_done_for_suggested_uf);
    2824       574170 :   if (dump_enabled_p ())
    2825        21257 :     dump_printf_loc (MSG_NOTE, vect_location,
    2826              :                      "***** Analysis %s with vector mode %s\n",
    2827        21257 :                      res ? "succeeded" : "failed",
    2828        21257 :                      GET_MODE_NAME (loop_vinfo->vector_mode));
    2829              : 
    2830       574170 :   auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
    2831       574170 :   if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
    2832              :       /* Check to see if the user wants to unroll or if the target wants to.  */
    2833       660116 :       && (suggested_unroll_factor > 1 || user_unroll > 1))
    2834              :     {
    2835          482 :       if (suggested_unroll_factor == 1)
    2836              :         {
    2837           66 :           int assumed_vf = vect_vf_for_cost (loop_vinfo);
    2838           66 :           suggested_unroll_factor = user_unroll / assumed_vf;
    2839           66 :           if (suggested_unroll_factor > 1)
    2840              :             {
    2841           40 :               if (dump_enabled_p ())
    2842           20 :                 dump_printf_loc (MSG_NOTE, vect_location,
    2843              :                          "setting unroll factor to %d based on user requested "
    2844              :                          "unroll factor %d and suggested vectorization "
    2845              :                          "factor: %d\n",
    2846              :                          suggested_unroll_factor, user_unroll, assumed_vf);
    2847              :             }
    2848              :         }
    2849              : 
    2850          482 :         if (suggested_unroll_factor > 1)
    2851              :           {
    2852          456 :             if (dump_enabled_p ())
    2853           62 :               dump_printf_loc (MSG_NOTE, vect_location,
    2854              :                          "***** Re-trying analysis for unrolling"
    2855              :                          " with unroll factor %d and %s slp.\n",
    2856              :                          suggested_unroll_factor,
    2857              :                          single_lane_slp_done_for_suggested_uf
    2858              :                          ? "single-lane" : "");
    2859          456 :             loop_vec_info unroll_vinfo
    2860          456 :                 = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
    2861          456 :             unroll_vinfo->vector_mode = vector_mode;
    2862          456 :             unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
    2863          456 :             opt_result new_res
    2864          456 :               = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
    2865              :                                      single_lane_slp_done_for_suggested_uf);
    2866          456 :             if (new_res)
    2867              :               {
    2868          397 :                 delete loop_vinfo;
    2869          397 :                 loop_vinfo = unroll_vinfo;
    2870              :               }
    2871              :             else
    2872           59 :               delete unroll_vinfo;
    2873              :           }
    2874              : 
    2875              :         /* Record that we have honored a user unroll factor.  */
    2876          482 :         LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
    2877              :     }
    2878              : 
    2879              :   /* Remember the autodetected vector mode.  */
    2880       574170 :   if (vector_mode == VOIDmode)
    2881       266495 :     autodetected_vector_mode = loop_vinfo->vector_mode;
    2882              : 
    2883              :   /* Advance mode_i, first skipping modes that would result in the
    2884              :      same analysis result.  */
    2885      2533746 :   while (mode_i + 1 < vector_modes.length ()
    2886      1743491 :          && vect_chooses_same_modes_p (loop_vinfo,
    2887       763703 :                                        vector_modes[mode_i + 1]))
    2888              :     {
    2889       405618 :       if (dump_enabled_p ())
    2890        17055 :         dump_printf_loc (MSG_NOTE, vect_location,
    2891              :                          "***** The result for vector mode %s would"
    2892              :                          " be the same\n",
    2893        17055 :                          GET_MODE_NAME (vector_modes[mode_i + 1]));
    2894       405618 :       mode_i += 1;
    2895              :     }
    2896       574170 :   if (mode_i + 1 < vector_modes.length ()
    2897       932255 :       && vect_chooses_same_modes_p (autodetected_vector_mode,
    2898       358085 :                                     vector_modes[mode_i + 1]))
    2899              :     {
    2900          420 :       if (dump_enabled_p ())
    2901           11 :         dump_printf_loc (MSG_NOTE, vect_location,
    2902              :                          "***** Skipping vector mode %s, which would"
    2903              :                          " repeat the analysis for %s\n",
    2904           11 :                          GET_MODE_NAME (vector_modes[mode_i + 1]),
    2905           11 :                          GET_MODE_NAME (autodetected_vector_mode));
    2906          420 :       mode_i += 1;
    2907              :     }
    2908       574170 :   mode_i++;
    2909              : 
    2910       574170 :   if (!res)
    2911              :     {
    2912       479895 :       delete loop_vinfo;
    2913       479895 :       if (fatal)
    2914       102457 :         gcc_checking_assert (orig_loop_vinfo == NULL);
    2915       479895 :       return opt_loop_vec_info::propagate_failure (res);
    2916              :     }
    2917              : 
    2918        94275 :   return opt_loop_vec_info::success (loop_vinfo);
    2919              : }
    2920              : 
    2921              : /* Function vect_analyze_loop.
    2922              : 
    2923              :    Apply a set of analyses on LOOP, and create a loop_vec_info struct
    2924              :    for it.  The different analyses will record information in the
    2925              :    loop_vec_info struct.  */
    2926              : opt_loop_vec_info
    2927       464638 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
    2928              :                    vec_info_shared *shared)
    2929              : {
    2930       464638 :   DUMP_VECT_SCOPE ("analyze_loop_nest");
    2931              : 
    2932       464638 :   if (loop_outer (loop)
    2933       464638 :       && loop_vec_info_for_loop (loop_outer (loop))
    2934       465220 :       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    2935          582 :     return opt_loop_vec_info::failure_at (vect_location,
    2936              :                                           "outer-loop already vectorized.\n");
    2937              : 
    2938       464056 :   if (!find_loop_nest (loop, &shared->loop_nest))
    2939        22302 :     return opt_loop_vec_info::failure_at
    2940        22302 :       (vect_location,
    2941              :        "not vectorized: loop nest containing two or more consecutive inner"
    2942              :        " loops cannot be vectorized\n");
    2943              : 
    2944              :   /* Analyze the loop form.  */
    2945       441754 :   vect_loop_form_info loop_form_info;
    2946       441754 :   opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
    2947              :                                            &loop_form_info);
    2948       441754 :   if (!res)
    2949              :     {
    2950       175259 :       if (dump_enabled_p ())
    2951         1527 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2952              :                          "bad loop form.\n");
    2953       175259 :       return opt_loop_vec_info::propagate_failure (res);
    2954              :     }
    2955       266495 :   if (!integer_onep (loop_form_info.assumptions))
    2956              :     {
    2957              :       /* We consider vectorizing this loop by versioning it under
    2958              :          some assumptions.  In order to do this, we need to clear
    2959              :          existing information computed by scev and niter analyzer.  */
    2960         8366 :       scev_reset_htab ();
    2961         8366 :       free_numbers_of_iterations_estimates (loop);
    2962              :       /* Also set flag for this loop so that following scev and niter
    2963              :          analysis are done under the assumptions.  */
    2964         8366 :       loop_constraint_set (loop, LOOP_C_FINITE);
    2965              :     }
    2966              :   else
    2967              :     /* Clear the existing niter information to make sure the nonwrapping flag
    2968              :        will be calculated and set appropriately.  */
    2969       258129 :     free_numbers_of_iterations_estimates (loop);
    2970              : 
    2971       266495 :   auto_vector_modes vector_modes;
    2972              :   /* Autodetect first vector size we try.  */
    2973       266495 :   vector_modes.safe_push (VOIDmode);
    2974       266495 :   unsigned int autovec_flags
    2975       532990 :     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
    2976       266495 :                                                     loop->simdlen != 0);
    2977       266495 :   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
    2978       266495 :                              && !unlimited_cost_model (loop));
    2979       266495 :   machine_mode autodetected_vector_mode = VOIDmode;
    2980       266495 :   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
    2981       266495 :   unsigned int mode_i = 0;
    2982       266495 :   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
    2983              : 
    2984              :   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
    2985              :      a mode has not been analyzed.  */
    2986       266495 :   auto_vec<poly_uint64, 8> cached_vf_per_mode;
    2987      2676650 :   for (unsigned i = 0; i < vector_modes.length (); ++i)
    2988      1071830 :     cached_vf_per_mode.safe_push (0);
    2989              : 
    2990              :   /* First determine the main loop vectorization mode, either the first
    2991              :      one that works, starting with auto-detecting the vector mode and then
    2992              :      following the target's order of preference, or the one with the
    2993              :      lowest cost if pick_lowest_cost_p.  */
    2994       854161 :   while (1)
    2995              :     {
    2996       560328 :       bool fatal;
    2997       560328 :       unsigned int last_mode_i = mode_i;
    2998              :       /* Set cached VF to -1 prior to analysis, which indicates a mode has
    2999              :          failed.  */
    3000       560328 :       cached_vf_per_mode[last_mode_i] = -1;
    3001       560328 :       opt_loop_vec_info loop_vinfo
    3002       560328 :         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
    3003              :                                NULL, vector_modes, mode_i, -1,
    3004              :                                autodetected_vector_mode, fatal);
    3005       560328 :       if (fatal)
    3006              :         break;
    3007              : 
    3008       457871 :       if (loop_vinfo)
    3009              :         {
    3010              :           /*  Analysis has been successful so update the VF value.  The
    3011              :               VF should always be a multiple of unroll_factor and we want to
    3012              :               capture the original VF here.  */
    3013        85946 :           cached_vf_per_mode[last_mode_i]
    3014        85946 :             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
    3015        85946 :                          loop_vinfo->suggested_unroll_factor);
    3016              :           /* Once we hit the desired simdlen for the first time,
    3017              :              discard any previous attempts.  */
    3018        85946 :           if (simdlen
    3019        85946 :               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
    3020              :             {
    3021           47 :               delete first_loop_vinfo;
    3022              :               first_loop_vinfo = opt_loop_vec_info::success (NULL);
    3023              :               simdlen = 0;
    3024              :             }
    3025        85899 :           else if (pick_lowest_cost_p
    3026        71921 :                    && first_loop_vinfo
    3027       116878 :                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
    3028              :             {
    3029              :               /* Pick loop_vinfo over first_loop_vinfo.  */
    3030         1178 :               delete first_loop_vinfo;
    3031         1178 :               first_loop_vinfo = opt_loop_vec_info::success (NULL);
    3032              :             }
    3033        85946 :           if (first_loop_vinfo == NULL)
    3034              :             first_loop_vinfo = loop_vinfo;
    3035              :           else
    3036              :             {
    3037        29803 :               delete loop_vinfo;
    3038        29803 :               loop_vinfo = opt_loop_vec_info::success (NULL);
    3039              :             }
    3040              : 
    3041              :           /* Commit to first_loop_vinfo if we have no reason to try
    3042              :              alternatives.  */
    3043        85946 :           if (!simdlen && !pick_lowest_cost_p)
    3044              :             break;
    3045              :         }
    3046       443855 :       if (mode_i == vector_modes.length ()
    3047       443855 :           || autodetected_vector_mode == VOIDmode)
    3048              :         break;
    3049              : 
    3050              :       /* Try the next biggest vector size.  */
    3051       293833 :       if (dump_enabled_p ())
    3052         4773 :         dump_printf_loc (MSG_NOTE, vect_location,
    3053              :                          "***** Re-trying analysis with vector mode %s\n",
    3054         4773 :                          GET_MODE_NAME (vector_modes[mode_i]));
    3055       293833 :     }
    3056       266495 :   if (!first_loop_vinfo)
    3057       211535 :     return opt_loop_vec_info::propagate_failure (res);
    3058              : 
    3059        54960 :   if (dump_enabled_p ())
    3060         9559 :     dump_printf_loc (MSG_NOTE, vect_location,
    3061              :                      "***** Choosing vector mode %s\n",
    3062         9559 :                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
    3063              : 
    3064              :   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
    3065              :      enabled, SIMDUID is not set, it is the innermost loop and we have
    3066              :      either already found the loop's SIMDLEN or there was no SIMDLEN to
    3067              :      begin with.
    3068              :      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
    3069        54960 :   bool vect_epilogues = (!simdlen
    3070        54958 :                          && loop->inner == NULL
    3071        54362 :                          && param_vect_epilogues_nomask
    3072        53220 :                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
    3073              :                            /* No code motion support for multiple epilogues so for now
    3074              :                               not supported when multiple exits.  */
    3075        26032 :                          && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
    3076        25568 :                          && !loop->simduid
    3077        79115 :                          && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
    3078        54960 :   if (!vect_epilogues)
    3079        42019 :     return first_loop_vinfo;
    3080              : 
    3081              :   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
    3082              : 
    3083              :   /* For epilogues start the analysis from the first mode.  The motivation
    3084              :      behind starting from the beginning comes from cases where the VECTOR_MODES
    3085              :      array may contain length-agnostic and length-specific modes.  Their
    3086              :      ordering is not guaranteed, so we could end up picking a mode for the main
    3087              :      loop that is after the epilogue's optimal mode.  */
    3088        12941 :   int masked_p = -1;
    3089        12941 :   if (!unlimited_cost_model (loop)
    3090        12941 :       && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
    3091              :           != VOIDmode))
    3092              :     {
    3093            4 :       vector_modes[0]
    3094            4 :         = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
    3095            4 :       cached_vf_per_mode[0] = 0;
    3096              :     }
    3097              :   else
    3098        12937 :     vector_modes[0] = autodetected_vector_mode;
    3099        12941 :   mode_i = 0;
    3100              : 
    3101        12977 :   bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
    3102        12941 :                                    || masked_p == 1);
    3103              :   if (supports_partial_vectors
    3104           36 :       && !partial_vectors_supported_p ()
    3105           36 :       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
    3106              :     supports_partial_vectors = false;
    3107        12941 :   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
    3108              : 
    3109        12941 :   loop_vec_info orig_loop_vinfo = first_loop_vinfo;
    3110        13107 :   do
    3111              :     {
    3112              :       /* Let the user override what the target suggests.  */
    3113        13024 :       if (OPTION_SET_P (param_vect_partial_vector_usage))
    3114           45 :         masked_p = -1;
    3115              : 
    3116        50392 :       while (1)
    3117              :         {
    3118              :           /* If the target does not support partial vectors we can shorten the
    3119              :              number of modes to analyze for the epilogue as we know we can't
    3120              :              pick a mode that would lead to a VF at least as big as the
    3121              :              FIRST_VINFO_VF.  */
    3122        67065 :           if (!supports_partial_vectors
    3123        50392 :               && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
    3124              :             {
    3125        23805 :               mode_i++;
    3126        47610 :               if (mode_i == vector_modes.length ())
    3127              :                 break;
    3128        29418 :               continue;
    3129              :             }
    3130              :           /* We would need an exhaustive search to find all modes we
    3131              :              skipped but that would lead to the same result as the
    3132              :              analysis it was skipped for and where we could check
    3133              :              cached_vf_per_mode against.
    3134              :              Check for the autodetected mode, which is the common
    3135              :              situation on x86 which does not perform cost comparison.  */
    3136        39332 :           if (!supports_partial_vectors
    3137        26544 :               && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
    3138        52344 :               && vect_chooses_same_modes_p (autodetected_vector_mode,
    3139        25757 :                                             vector_modes[mode_i]))
    3140              :             {
    3141        12745 :               mode_i++;
    3142        25490 :               if (mode_i == vector_modes.length ())
    3143              :                 break;
    3144        12745 :               continue;
    3145              :             }
    3146              : 
    3147        13842 :           if (dump_enabled_p ())
    3148         3255 :             dump_printf_loc (MSG_NOTE, vect_location,
    3149              :                              "***** Re-trying epilogue analysis with vector "
    3150         3255 :                              "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
    3151              : 
    3152        13842 :           bool fatal;
    3153        13842 :           opt_loop_vec_info loop_vinfo
    3154        13842 :             = vect_analyze_loop_1 (loop, shared, &loop_form_info,
    3155              :                                    orig_loop_vinfo,
    3156              :                                    vector_modes, mode_i, masked_p,
    3157              :                                    autodetected_vector_mode, fatal);
    3158        13842 :           if (fatal)
    3159              :             break;
    3160              : 
    3161        13842 :           if (loop_vinfo)
    3162              :             {
    3163         8329 :               if (pick_lowest_cost_p
    3164         5379 :                   && orig_loop_vinfo->epilogue_vinfo
    3165         9811 :                   && vect_joust_loop_vinfos (loop_vinfo,
    3166         1482 :                                              orig_loop_vinfo->epilogue_vinfo))
    3167              :                 {
    3168          170 :                   gcc_assert (vect_epilogues);
    3169          170 :                   delete orig_loop_vinfo->epilogue_vinfo;
    3170          170 :                   orig_loop_vinfo->epilogue_vinfo = nullptr;
    3171              :                 }
    3172         8329 :               if (!orig_loop_vinfo->epilogue_vinfo)
    3173         7017 :                 orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
    3174              :               else
    3175              :                 {
    3176         1312 :                   delete loop_vinfo;
    3177         1312 :                   loop_vinfo = opt_loop_vec_info::success (NULL);
    3178              :                 }
    3179              : 
    3180              :               /* For now only allow one epilogue loop, but allow
    3181              :                  pick_lowest_cost_p to replace it, so commit to the
    3182              :                  first epilogue if we have no reason to try alternatives.  */
    3183         8329 :               if (!pick_lowest_cost_p)
    3184              :                 break;
    3185              :             }
    3186              : 
    3187              :           /* Revert back to the default from the suggested preferred
    3188              :              epilogue vectorization mode.  */
    3189        10892 :           masked_p = -1;
    3190        21784 :           if (mode_i == vector_modes.length ())
    3191              :             break;
    3192              :         }
    3193              : 
    3194        13024 :       orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
    3195        13024 :       if (!orig_loop_vinfo)
    3196              :         break;
    3197              : 
    3198              :       /* When we selected a first vectorized epilogue, see if the target
    3199              :          suggests to have another one.  */
    3200         6847 :       masked_p = -1;
    3201         6847 :       if (!unlimited_cost_model (loop)
    3202         3903 :           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
    3203        10744 :           && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
    3204              :               != VOIDmode))
    3205              :         {
    3206          166 :           vector_modes[0]
    3207           83 :             = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
    3208           83 :           cached_vf_per_mode[0] = 0;
    3209           83 :           mode_i = 0;
    3210              :         }
    3211              :       else
    3212              :         break;
    3213           83 :     }
    3214              :   while (1);
    3215              : 
    3216        12941 :   if (first_loop_vinfo->epilogue_vinfo)
    3217              :     {
    3218         6772 :       poly_uint64 lowest_th
    3219         6772 :         = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
    3220         6772 :       loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
    3221         6847 :       do
    3222              :         {
    3223         6847 :           poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
    3224         6847 :           gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
    3225              :                       || maybe_ne (lowest_th, 0U));
    3226              :           /* Keep track of the known smallest versioning threshold.  */
    3227         6847 :           if (ordered_p (lowest_th, th))
    3228         6847 :             lowest_th = ordered_min (lowest_th, th);
    3229         6847 :           epilog_vinfo = epilog_vinfo->epilogue_vinfo;
    3230              :         }
    3231         6847 :       while (epilog_vinfo);
    3232         6772 :       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
    3233         6772 :       if (dump_enabled_p ())
    3234         1449 :         dump_printf_loc (MSG_NOTE, vect_location,
    3235              :                          "***** Choosing epilogue vector mode %s\n",
    3236         1449 :                          GET_MODE_NAME
    3237              :                            (first_loop_vinfo->epilogue_vinfo->vector_mode));
    3238              :     }
    3239              : 
    3240        12941 :   return first_loop_vinfo;
    3241       708249 : }
    3242              : 
    3243              : /* Return true if there is an in-order reduction function for CODE, storing
    3244              :    it in *REDUC_FN if so.  Only PLUS_EXPR and MINUS_EXPR qualify, and both
                      :    map to IFN_FOLD_LEFT_PLUS.  *REDUC_FN is not written when false is
                      :    returned.  */
    3245              : 
    3246              : static bool
    3247         5083 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
    3248              : {
    3249              :   /* We support MINUS_EXPR by negating the operand.  This also preserves an
    3250              :      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
    3251              :      (-0.0) = -0.0.  */
    3252         5083 :   if (code == PLUS_EXPR || code == MINUS_EXPR)
    3253              :     {
    3254         4411 :       *reduc_fn = IFN_FOLD_LEFT_PLUS;
    3255            0 :       return true;
    3256              :     }
    3257              :   return false;
    3258              : }
    3259              : 
    3260              : /* Function reduction_fn_for_scalar_code
    3261              : 
    3262              :    Input:
    3263              :    CODE - the code of a reduction operation, either a tree code or a
                      :       combined function (of the latter only the CASE_CFN_FMAX and
                      :       CASE_CFN_FMIN functions are handled).
    3264              : 
    3265              :    Output:
    3266              :    REDUC_FN - the corresponding internal function to be used to reduce the
    3267              :       vector of partial results into a single scalar result, or IFN_LAST
    3268              :       if the operation is a supported reduction operation, but does not have
    3269              :       such an internal function (this is the case for MULT_EXPR and
                      :       MINUS_EXPR).  *REDUC_FN is not written when FALSE is returned.
    3270              : 
    3271              :    Return FALSE if CODE currently cannot be vectorized as reduction.  */
    3272              : 
    3273              : bool
    3274      2030024 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
    3275              : {
    3276      2030024 :   if (code.is_tree_code ())
    3277      2029966 :     switch (tree_code (code))
    3278              :       {
    3279        15328 :       case MAX_EXPR:
    3280        15328 :         *reduc_fn = IFN_REDUC_MAX;
    3281        15328 :         return true;
    3282              : 
    3283        62655 :       case MIN_EXPR:
    3284        62655 :         *reduc_fn = IFN_REDUC_MIN;
    3285        62655 :         return true;
    3286              : 
    3287      1102729 :       case PLUS_EXPR:
    3288      1102729 :         *reduc_fn = IFN_REDUC_PLUS;
    3289      1102729 :         return true;
    3290              : 
    3291       232490 :       case BIT_AND_EXPR:
    3292       232490 :         *reduc_fn = IFN_REDUC_AND;
    3293       232490 :         return true;
    3294              : 
    3295       287494 :       case BIT_IOR_EXPR:
    3296       287494 :         *reduc_fn = IFN_REDUC_IOR;
    3297       287494 :         return true;
    3298              : 
    3299        44295 :       case BIT_XOR_EXPR:
    3300        44295 :         *reduc_fn = IFN_REDUC_XOR;
    3301        44295 :         return true;
    3302              : 
    3303       284975 :       case MULT_EXPR:
    3304       284975 :       case MINUS_EXPR:
    3305       284975 :         *reduc_fn = IFN_LAST;
    3306       284975 :         return true;
    3307              : 
    3308              :       default:
    3309              :         return false;
    3310              :       }
    3311              :   else
    3312           58 :     switch (combined_fn (code))
    3313              :       {
    3314           34 :       CASE_CFN_FMAX:
    3315           34 :         *reduc_fn = IFN_REDUC_FMAX;
    3316           34 :         return true;
    3317              : 
    3318           24 :       CASE_CFN_FMIN:
    3319           24 :         *reduc_fn = IFN_REDUC_FMIN;
    3320           24 :         return true;
    3321              : 
    3322              :       default:
    3323              :         return false;
    3324              :       }
    3325              : }
    3326              : 
    3327              : /* Set *SBOOL_FN to the corresponding function working on vector masks
    3328              :    for REDUC_FN.  Return true if that exists, false otherwise.  Only
                      :    IFN_REDUC_AND, IFN_REDUC_IOR and IFN_REDUC_XOR have such a counterpart
                      :    (IFN_REDUC_SBOOL_AND, IFN_REDUC_SBOOL_IOR and IFN_REDUC_SBOOL_XOR
                      :    respectively); *SBOOL_FN is not written when false is returned.  */
    3329              : 
    3330              : static bool
    3331            0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
    3332              : {
    3333            0 :   switch (reduc_fn)
    3334              :     {
    3335            0 :     case IFN_REDUC_AND:
    3336            0 :       *sbool_fn = IFN_REDUC_SBOOL_AND;
    3337            0 :       return true;
    3338            0 :     case IFN_REDUC_IOR:
    3339            0 :       *sbool_fn = IFN_REDUC_SBOOL_IOR;
    3340            0 :       return true;
    3341            0 :     case IFN_REDUC_XOR:
    3342            0 :       *sbool_fn = IFN_REDUC_SBOOL_XOR;
    3343            0 :       return true;
    3344              :     default:
    3345              :       return false;
    3346              :     }
    3347              : }
    3348              : 
    3349              : /* If there is a neutral value X such that a reduction would not be affected
    3350              :    by the introduction of additional X elements, return that X, otherwise
    3351              :    return null.  CODE is the code of the reduction and SCALAR_TYPE is the type
    3352              :    of the scalar elements.  If the reduction has just a single initial value
    3353              :    then INITIAL_VALUE is that value, otherwise it is null.
    3354              :    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
    3355              :    In that case a plain zero rather than -0.0 is returned for WIDEN_SUM_EXPR
                      :    and PLUS_EXPR, even when SCALAR_TYPE honors signed zeros.
                      :    For MIN_EXPR, MAX_EXPR and the FMIN/FMAX functions INITIAL_VALUE itself
                      :    is returned, so the result is null whenever INITIAL_VALUE is.  */
    3356              : 
    3357              : tree
    3358        77549 : neutral_op_for_reduction (tree scalar_type, code_helper code,
    3359              :                           tree initial_value, bool as_initial)
    3360              : {
    3361        77549 :   if (code.is_tree_code ())
    3362        77491 :     switch (tree_code (code))
    3363              :       {
    3364        13836 :       case DOT_PROD_EXPR:
    3365        13836 :       case SAD_EXPR:
    3366        13836 :       case MINUS_EXPR:
    3367        13836 :       case BIT_IOR_EXPR:
    3368        13836 :       case BIT_XOR_EXPR:
    3369        13836 :         return build_zero_cst (scalar_type);
    3370        57475 :       case WIDEN_SUM_EXPR:
    3371        57475 :       case PLUS_EXPR:
    3372        57475 :         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
    3373          100 :           return build_real (scalar_type, dconstm0);
    3374              :         else
    3375        57375 :           return build_zero_cst (scalar_type);
    3376              : 
    3377         2165 :       case MULT_EXPR:
    3378         2165 :         return build_one_cst (scalar_type);
    3379              : 
    3380         1558 :       case BIT_AND_EXPR:
    3381         1558 :         return build_all_ones_cst (scalar_type);
    3382              : 
    3383              :       case MAX_EXPR:
    3384              :       case MIN_EXPR:
    3385              :         return initial_value;
    3386              : 
    3387          428 :       default:
    3388          428 :         return NULL_TREE;
    3389              :       }
    3390              :   else
    3391           58 :     switch (combined_fn (code))
    3392              :       {
    3393              :       CASE_CFN_FMIN:
    3394              :       CASE_CFN_FMAX:
    3395              :         return initial_value;
    3396              : 
    3397            0 :       default:
    3398            0 :         return NULL_TREE;
    3399              :       }
    3400              : }
    3401              : 
    3402              : /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
    3403              :    STMT is printed with a message MSG: MSG is emitted first, immediately
                      :    followed by STMT, as a dump of kind MSG_TYPE located at
                      :    vect_location.  */
    3404              : 
    3405              : static void
    3406          578 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
    3407              : {
    3408          578 :   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
    3409          578 : }
    3410              : 
    3411              : /* Return true if we need an in-order reduction for operation CODE
    3412              :    on type TYPE.  For scalar floating-point types that is the case unless
    3413              :    CODE is a MIN/MAX operation (MIN_EXPR, MAX_EXPR, FMIN, FMAX) or
                      :    flag_associative_math is set.  For integral types it is the case when
                      :    CODE is not a tree code or when operation_no_trapping_overflow does
                      :    not hold for it.  Saturating fixed-point types always need it; all
                      :    other types never do.  */
    3414              : 
    3415              : bool
    3416      6544927 : needs_fold_left_reduction_p (tree type, code_helper code)
    3417              : {
    3418              :   /* CHECKME: check for !flag_finite_math_only too?  */
    3419      6544927 :   if (SCALAR_FLOAT_TYPE_P (type))
    3420              :     {
    3421       581529 :       if (code.is_tree_code ())
    3422       581475 :         switch (tree_code (code))
    3423              :           {
    3424              :           case MIN_EXPR:
    3425              :           case MAX_EXPR:
    3426              :             return false;
    3427              : 
    3428       579619 :           default:
    3429       579619 :             return !flag_associative_math;
    3430              :           }
    3431              :       else
    3432           54 :         switch (combined_fn (code))
    3433              :           {
    3434              :           CASE_CFN_FMIN:
    3435              :           CASE_CFN_FMAX:
    3436              :             return false;
    3437              : 
    3438            2 :           default:
    3439            2 :             return !flag_associative_math;
    3440              :           }
    3441              :     }
    3442              : 
    3443      5963398 :   if (INTEGRAL_TYPE_P (type))
    3444      5962515 :     return (!code.is_tree_code ()
    3445      5962515 :             || !operation_no_trapping_overflow (type, tree_code (code)));
    3446              : 
    3447          883 :   if (SAT_FIXED_POINT_TYPE_P (type))
    3448              :     return true;
    3449              : 
    3450              :   return false;
    3451              : }
    3452              : 
    3453              : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
    3454              :    has a handled computation expression.  Store the main reduction
    3455              :    operation in *CODE; *CODE is ERROR_MARK if no value-changing operation
                      :    was found, in which case false is returned.  PATH receives the chain of
                      :    uses found while searching from LOOP_ARG back to the PHI result and is
                      :    left empty when no such chain exists.  LOC is the location used for the
                      :    detailed dump of the path.  If INNER_LOOP_OF_DOUBLE_REDUC is true, uses
                      :    outside of LOOP count against the single-use requirement for every
                      :    statement on the path.  The check also fails when the path negates the
                      :    reduction value an odd number of times.  */
    3456              : 
    3457              : static bool
    3458       101961 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
    3459              :                       tree loop_arg, code_helper *code,
    3460              :                       vec<std::pair<ssa_op_iter, use_operand_p> > &path,
    3461              :                       bool inner_loop_of_double_reduc)
    3462              : {
    3463       101961 :   auto_bitmap visited;
    3464       101961 :   tree lookfor = PHI_RESULT (phi);
    3465       101961 :   ssa_op_iter curri;
    3466       101961 :   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
    3467       212077 :   while (USE_FROM_PTR (curr) != loop_arg)
    3468         8155 :     curr = op_iter_next_use (&curri);
    3469       101961 :   curri.i = curri.numops;
    3470       949228 :   do
    3471              :     {
    3472       949228 :       path.safe_push (std::make_pair (curri, curr));
    3473       949228 :       tree use = USE_FROM_PTR (curr);
    3474       949228 :       if (use == lookfor)
    3475              :         break;
    3476       847677 :       gimple *def = SSA_NAME_DEF_STMT (use);
    3477       847677 :       if (gimple_nop_p (def)
    3478       847677 :           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
    3479              :         {
    3480       713432 : pop:
    3481       713432 :           do
    3482              :             {
    3483       713432 :               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
    3484       713432 :               curri = x.first;
    3485       713432 :               curr = x.second;
    3486       780850 :               do
    3487       780850 :                 curr = op_iter_next_use (&curri);
    3488              :               /* Skip already visited or non-SSA operands (from iterating
    3489              :                  over PHI args).  */
    3490              :               while (curr != NULL_USE_OPERAND_P
    3491      1561700 :                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
    3492       269813 :                          || ! bitmap_set_bit (visited,
    3493       269813 :                                               SSA_NAME_VERSION
    3494              :                                                 (USE_FROM_PTR (curr)))));
    3495              :             }
    3496      1426864 :           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
    3497       238660 :           if (curr == NULL_USE_OPERAND_P)
    3498              :             break;
    3499              :         }
    3500              :       else
    3501              :         {
    3502       712761 :           if (gimple_code (def) == GIMPLE_PHI)
    3503        72327 :             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
    3504              :           else
    3505       640434 :             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
    3506              :           while (curr != NULL_USE_OPERAND_P
    3507       850888 :                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
    3508       742007 :                      || ! bitmap_set_bit (visited,
    3509       742007 :                                           SSA_NAME_VERSION
    3510              :                                             (USE_FROM_PTR (curr)))))
    3511       138127 :             curr = op_iter_next_use (&curri);
    3512       712761 :           if (curr == NULL_USE_OPERAND_P)
    3513       103744 :             goto pop;
    3514              :         }
    3515              :     }
    3516              :   while (1);
    3517       101961 :   if (dump_file && (dump_flags & TDF_DETAILS))
    3518              :     {
    3519         4111 :       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
    3520         4111 :       unsigned i;
    3521         4111 :       std::pair<ssa_op_iter, use_operand_p> *x;
    3522        13974 :       FOR_EACH_VEC_ELT (path, i, x)
    3523         9863 :         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
    3524         4111 :       dump_printf (MSG_NOTE, "\n");
    3525              :     }
    3526              : 
    3527              :   /* Check whether the reduction path detected is valid.  */
    3528       101961 :   bool fail = path.length () == 0;
    3529       101961 :   bool neg = false;
    3530       101961 :   int sign = -1;
    3531       101961 :   *code = ERROR_MARK;
    3532       219475 :   for (unsigned i = 1; i < path.length (); ++i)
    3533              :     {
    3534       124207 :       gimple *use_stmt = USE_STMT (path[i].second);
    3535       124207 :       gimple_match_op op;
    3536       124207 :       if (!gimple_extract_op (use_stmt, &op))
    3537              :         {
    3538              :           fail = true;
    3539         6693 :           break;
    3540              :         }
    3541       123304 :       unsigned int opi = op.num_ops;
    3542       123304 :       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
    3543              :         {
    3544              :           /* The following make sure we can compute the operand index
    3545              :              easily plus it mostly disallows chaining via COND_EXPR condition
    3546              :              operands.  */
    3547       190654 :           for (opi = 0; opi < op.num_ops; ++opi)
    3548       189641 :             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
    3549              :               break;
    3550              :         }
    3551         6230 :       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
    3552              :         {
    3553        12485 :           for (opi = 0; opi < op.num_ops; ++opi)
    3554        12485 :             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
    3555              :               break;
    3556              :         }
    3557       123304 :       if (opi == op.num_ops)
    3558              :         {
    3559              :           fail = true;
    3560              :           break;
    3561              :         }
    3562       122291 :       op.code = canonicalize_code (op.code, op.type);
    3563       122291 :       if (op.code == MINUS_EXPR)
    3564              :         {
    3565         5668 :           op.code = PLUS_EXPR;
    3566              :           /* Track whether we negate the reduction value each iteration.  */
    3567         5668 :           if (op.ops[1] == op.ops[opi])
    3568           34 :             neg = ! neg;
    3569              :         }
    3570       116623 :       else if (op.code == IFN_COND_SUB)
    3571              :         {
    3572            9 :           op.code = IFN_COND_ADD;
    3573              :           /* Track whether we negate the reduction value each iteration.  */
    3574            9 :           if (op.ops[2] == op.ops[opi])
    3575            0 :             neg = ! neg;
    3576              :         }
    3577              :       /* For an FMA the reduction code is the PLUS if the addition chain
    3578              :          is the reduction.  */
    3579       116614 :       else if (op.code == IFN_FMA && opi == 2)
    3580           33 :         op.code = PLUS_EXPR;
    3581       122291 :       if (CONVERT_EXPR_CODE_P (op.code)
    3582       122291 :           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
    3583              :         ;
    3584       116813 :       else if (*code == ERROR_MARK)
    3585              :         {
    3586        99732 :           *code = op.code;
    3587        99732 :           sign = TYPE_SIGN (op.type);
    3588              :         }
    3589        17081 :       else if (op.code != *code)
    3590              :         {
    3591              :           fail = true;
    3592              :           break;
    3593              :         }
    3594        15761 :       else if ((op.code == MIN_EXPR
    3595        15605 :                 || op.code == MAX_EXPR)
    3596        15776 :                && sign != TYPE_SIGN (op.type))
    3597              :         {
    3598              :           fail = true;
    3599              :           break;
    3600              :         }
    3601              :       /* Check there's only a single stmt the op is used on.  For the
    3602              :          not value-changing tail and the last stmt allow out-of-loop uses,
    3603              :          but not when this is the inner loop of a double reduction.
    3604              :          ???  We could relax this and handle arbitrary live stmts by
    3605              :          forcing a scalar epilogue for example.  */
    3606       120968 :       imm_use_iterator imm_iter;
    3607       120968 :       use_operand_p use_p;
    3608       120968 :       gimple *op_use_stmt;
    3609       120968 :       unsigned cnt = 0;
    3610       127163 :       bool cond_fn_p = op.code.is_internal_fn ()
    3611         6195 :         && (conditional_internal_fn_code (internal_fn (op.code))
    3612       120968 :             != ERROR_MARK);
    3613              : 
    3614       411222 :       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
    3615              :         {
    3616              :           /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
    3617              :              have op1 twice (once as definition, once as else) in the same
    3618              :              operation.  Enforce this.  */
    3619       169286 :           if (cond_fn_p && op_use_stmt == use_stmt)
    3620              :             {
    3621         6129 :               gcall *call = as_a<gcall *> (use_stmt);
    3622         6129 :               unsigned else_pos
    3623         6129 :                 = internal_fn_else_index (internal_fn (op.code));
    3624         6129 :               if (gimple_call_arg (call, else_pos) != op.ops[opi])
    3625              :                 {
    3626              :                   fail = true;
    3627              :                   break;
    3628              :                 }
    3629        30645 :               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
    3630              :                 {
    3631        24516 :                   if (j == else_pos)
    3632         6129 :                     continue;
    3633        18387 :                   if (gimple_call_arg (call, j) == op.ops[opi])
    3634         6129 :                     cnt++;
    3635              :                 }
    3636              :             }
    3637       163157 :           else if (!is_gimple_debug (op_use_stmt)
    3638       163157 :                    && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
    3639         2813 :                        || flow_bb_inside_loop_p (loop,
    3640         2813 :                                                  gimple_bb (op_use_stmt))))
    3641       236733 :             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
    3642       118371 :               cnt++;
    3643       120968 :         }
    3644              : 
    3645       120968 :       if (cnt != 1)
    3646              :         {
    3647              :           fail = true;
    3648              :           break;
    3649              :         }
    3650              :     }
    3651       109075 :   return ! fail && ! neg && *code != ERROR_MARK;
    3652       101961 : }
    3653              : 
    3654              : bool
    3655           21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
    3656              :                       tree loop_arg, enum tree_code code)
    3657              : {
    3658           21 :   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
    3659           21 :   code_helper code_;
    3660           21 :   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
    3661           21 :           && code_ == code);
    3662           21 : }
    3663              : 
    3664              : 
    3665              : 
    3666              : /* Function vect_is_simple_reduction
    3667              : 
    3668              :    (1) Detect a cross-iteration def-use cycle that represents a simple
    3669              :    reduction computation.  We look for the following pattern:
    3670              : 
    3671              :    loop_header:
    3672              :      a1 = phi < a0, a2 >
    3673              :      a3 = ...
    3674              :      a2 = operation (a3, a1)
    3675              : 
    3676              :    or
    3677              : 
    3678              :    a3 = ...
    3679              :    loop_header:
    3680              :      a1 = phi < a0, a2 >
    3681              :      a2 = operation (a3, a1)
    3682              : 
    3683              :    such that:
    3684              :    1. operation is commutative and associative and it is safe to
    3685              :       change the order of the computation
    3686              :    2. no uses for a2 in the loop (a2 is used out of the loop)
    3687              :    3. no uses of a1 in the loop besides the reduction operation
    3688              :    4. no uses of a1 outside the loop.
    3689              : 
    3690              :    Conditions 1,4 are tested here.
    3691              :    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
    3692              : 
    3693              :    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
    3694              :    nested cycles.
    3695              : 
    3696              :    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
    3697              :    reductions:
    3698              : 
    3699              :      a1 = phi < a0, a2 >
    3700              :      inner loop (def of a3)
    3701              :      a2 = phi < a3 >
    3702              : 
    3703              :    (4) Detect condition expressions, i.e.:
    3704              :      for (int i = 0; i < N; i++)
    3705              :        if (a[i] < val)
    3706              :         ret_val = a[i];
    3707              : 
    3708              : */
    3709              : 
    3710              : static stmt_vec_info
    3711       164677 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
    3712              :                           gphi **double_reduc)
    3713              : {
    3714       164677 :   gphi *phi = as_a <gphi *> (phi_info->stmt);
    3715       164677 :   gimple *phi_use_stmt = NULL;
    3716       164677 :   imm_use_iterator imm_iter;
    3717       164677 :   use_operand_p use_p;
    3718              : 
    3719              :   /* When double_reduc is NULL we are testing the inner loop of a
    3720              :      double reduction.  */
    3721       164677 :   bool inner_loop_of_double_reduc = double_reduc == NULL;
    3722       164677 :   if (double_reduc)
    3723       163570 :     *double_reduc = NULL;
    3724       164677 :   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
    3725              : 
    3726       164677 :   tree phi_name = PHI_RESULT (phi);
    3727              :   /* ???  If there are no uses of the PHI result the inner loop reduction
    3728              :      won't be detected as possibly double-reduction by vectorizable_reduction
    3729              :      because that tries to walk the PHI arg from the preheader edge which
    3730              :      can be constant.  See PR60382.  */
    3731       164677 :   if (has_zero_uses (phi_name))
    3732              :     return NULL;
    3733       164541 :   class loop *loop = (gimple_bb (phi))->loop_father;
    3734       164541 :   unsigned nphi_def_loop_uses = 0;
    3735       621914 :   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
    3736              :     {
    3737       304439 :       gimple *use_stmt = USE_STMT (use_p);
    3738       304439 :       if (is_gimple_debug (use_stmt))
    3739        82616 :         continue;
    3740              : 
    3741       221823 :       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
    3742              :         {
    3743        11607 :           if (dump_enabled_p ())
    3744           35 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    3745              :                              "intermediate value used outside loop.\n");
    3746              : 
    3747        11607 :           return NULL;
    3748              :         }
    3749              : 
    3750              :       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
    3751              :          op1 twice (once as definition, once as else) in the same operation.
    3752              :          Only count it as one. */
    3753       210216 :       if (use_stmt != phi_use_stmt)
    3754              :         {
    3755       203679 :           nphi_def_loop_uses++;
    3756       203679 :           phi_use_stmt = use_stmt;
    3757              :         }
    3758        11607 :     }
    3759              : 
    3760       152934 :   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
    3761       152934 :   if (TREE_CODE (latch_def) != SSA_NAME)
    3762              :     {
    3763         1448 :       if (dump_enabled_p ())
    3764            8 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    3765              :                          "reduction: not ssa_name: %T\n", latch_def);
    3766         1448 :       return NULL;
    3767              :     }
    3768              : 
    3769       151486 :   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
    3770       151486 :   if (!def_stmt_info
    3771       151486 :       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
    3772          161 :     return NULL;
    3773              : 
    3774       151325 :   bool nested_in_vect_loop
    3775       151325 :     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
    3776       151325 :   unsigned nlatch_def_loop_uses = 0;
    3777       151325 :   auto_vec<gphi *, 3> lcphis;
    3778       743930 :   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
    3779              :     {
    3780       441280 :       gimple *use_stmt = USE_STMT (use_p);
    3781       441280 :       if (is_gimple_debug (use_stmt))
    3782       135828 :         continue;
    3783       305452 :       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
    3784       190578 :         nlatch_def_loop_uses++;
    3785              :       else
    3786              :         /* We can have more than one loop-closed PHI.  */
    3787       114874 :         lcphis.safe_push (as_a <gphi *> (use_stmt));
    3788       151325 :     }
    3789              : 
    3790              :   /* If we are vectorizing an inner reduction we are executing that
    3791              :      in the original order only in case we are not dealing with a
    3792              :      double reduction.  */
    3793       151325 :   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
    3794              :     {
    3795         2431 :       if (dump_enabled_p ())
    3796          434 :         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
    3797              :                         "detected nested cycle: ");
    3798         2431 :       return def_stmt_info;
    3799              :     }
    3800              : 
    3801              :   /* When the inner loop of a double reduction ends up with more than
    3802              :      one loop-closed PHI we have failed to classify alternate such
    3803              :      PHIs as double reduction, leading to wrong code.  See PR103237.  */
    3804       149989 :   if (inner_loop_of_double_reduc && lcphis.length () != 1)
    3805              :     {
    3806            1 :       if (dump_enabled_p ())
    3807            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    3808              :                          "unhandle double reduction\n");
    3809            1 :       return NULL;
    3810              :     }
    3811              : 
    3812              :   /* If this isn't a nested cycle or if the nested cycle reduction value
    3813              :      is used outside of the inner loop we cannot handle uses of the reduction
    3814              :      value.  */
    3815       148893 :   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
    3816              :     {
    3817        45591 :       if (dump_enabled_p ())
    3818          403 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    3819              :                          "reduction used in loop.\n");
    3820        45591 :       return NULL;
    3821              :     }
    3822              : 
    3823              :   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
    3824              :      defined in the inner loop.  */
    3825       103302 :   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
    3826              :     {
    3827         1362 :       tree op1 = PHI_ARG_DEF (def_stmt, 0);
    3828         1362 :       if (gimple_phi_num_args (def_stmt) != 1
    3829         1362 :           || TREE_CODE (op1) != SSA_NAME)
    3830              :         {
    3831           91 :           if (dump_enabled_p ())
    3832            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    3833              :                              "unsupported phi node definition.\n");
    3834              : 
    3835           91 :           return NULL;
    3836              :         }
    3837              : 
    3838              :       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
    3839              :          and the latch definition op1.  */
    3840         1271 :       gimple *def1 = SSA_NAME_DEF_STMT (op1);
    3841         1271 :       if (gimple_bb (def1)
    3842         1271 :           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
    3843         1271 :           && loop->inner
    3844         1217 :           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
    3845         1217 :           && (is_gimple_assign (def1) || is_gimple_call (def1))
    3846         1208 :           && is_a <gphi *> (phi_use_stmt)
    3847         1196 :           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
    3848         1196 :           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
    3849              :                                             loop_latch_edge (loop->inner)))
    3850         2465 :           && lcphis.length () == 1)
    3851              :         {
    3852         1107 :           if (dump_enabled_p ())
    3853          144 :             report_vect_op (MSG_NOTE, def_stmt,
    3854              :                             "detected double reduction: ");
    3855              : 
    3856         1107 :           *double_reduc = as_a <gphi *> (phi_use_stmt);
    3857         1107 :           return def_stmt_info;
    3858              :         }
    3859              : 
    3860          164 :       return NULL;
    3861              :     }
    3862              : 
    3863              :   /* Look for the expression computing latch_def from the loop PHI result.  */
    3864       101940 :   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
    3865       101940 :   code_helper code;
    3866       101940 :   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
    3867              :                             path, inner_loop_of_double_reduc))
    3868              :     {
    3869        94826 :       STMT_VINFO_REDUC_CODE (phi_info) = code;
    3870        94826 :       if (code == COND_EXPR && !nested_in_vect_loop)
    3871         8251 :         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
    3872              : 
    3873              :       /* Fill in STMT_VINFO_REDUC_IDX.  */
    3874        94826 :       unsigned i;
    3875       305505 :       for (i = path.length () - 1; i >= 1; --i)
    3876              :         {
    3877       115853 :           gimple *stmt = USE_STMT (path[i].second);
    3878       115853 :           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
    3879       115853 :           gimple_match_op op;
    3880       115853 :           if (!gimple_extract_op (stmt, &op))
    3881            0 :             gcc_unreachable ();
    3882       115853 :           if (gassign *assign = dyn_cast<gassign *> (stmt))
    3883       109643 :             STMT_VINFO_REDUC_IDX (stmt_info)
    3884       109643 :               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
    3885              :           else
    3886              :             {
    3887         6210 :               gcall *call = as_a<gcall *> (stmt);
    3888         6210 :               STMT_VINFO_REDUC_IDX (stmt_info)
    3889         6210 :                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
    3890              :             }
    3891              :         }
    3892        94826 :       if (dump_enabled_p ())
    3893         4104 :         dump_printf_loc (MSG_NOTE, vect_location,
    3894              :                          "reduction: detected reduction\n");
    3895              : 
    3896        94826 :       return def_stmt_info;
    3897              :     }
    3898              : 
    3899         7114 :   if (dump_enabled_p ())
    3900           89 :     dump_printf_loc (MSG_NOTE, vect_location,
    3901              :                      "reduction: unknown pattern\n");
    3902              : 
    3903              :   return NULL;
    3904       253265 : }
    3905              : 
    3906              : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
    3907              :    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
    3908              :    or -1 if not known.  */
    3909              : 
    3910              : static int
    3911       481820 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
    3912              : {
    3913       481820 :   int assumed_vf = vect_vf_for_cost (loop_vinfo);
    3914       481820 :   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
    3915              :     {
    3916       204120 :       if (dump_enabled_p ())
    3917         3587 :         dump_printf_loc (MSG_NOTE, vect_location,
    3918              :                          "cost model: epilogue peel iters set to vf/2 "
    3919              :                          "because loop iterations are unknown .\n");
    3920       204120 :       return assumed_vf / 2;
    3921              :     }
    3922              :   else
    3923              :     {
    3924       277700 :       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
    3925       277700 :       peel_iters_prologue = MIN (niters, peel_iters_prologue);
    3926       277700 :       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
    3927              :       /* If we need to peel for gaps, but no peeling is required, we have to
    3928              :          peel VF iterations.  */
    3929       277700 :       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
    3930       277700 :         peel_iters_epilogue = assumed_vf;
    3931       277700 :       return peel_iters_epilogue;
    3932              :     }
    3933              : }
    3934              : 
    3935              : /* Calculate cost of peeling the scalar loop PEEL_ITERS_PROLOGUE times for
    3936              :    a prologue and the corresponding times for the epilogue.  */
    3937              : int
    3938       357440 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue)
    3939              : {
    3940       357440 :   int retval = 0;
    3941              : 
    3942       357440 :   int peel_iters_epilogue
    3943       357440 :     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
    3944              : 
    3945       357440 :   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    3946              :     {
    3947              :       /* If peeled iterations are known but number of scalar loop
    3948              :          iterations are unknown, count a taken branch per peeled loop.  */
    3949       138259 :       if (peel_iters_prologue > 0)
    3950        84384 :         retval = builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
    3951       138259 :       if (peel_iters_epilogue > 0)
    3952       138151 :         retval += builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
    3953              :     }
    3954              : 
    3955       714880 :   retval += ((peel_iters_prologue + peel_iters_epilogue)
    3956       357440 :              * loop_vinfo->scalar_costs->body_cost ());
    3957       714880 :   retval += (((peel_iters_prologue != 0) + (peel_iters_epilogue != 0))
    3958       357440 :              * loop_vinfo->scalar_costs->outside_cost ());
    3959              : 
    3960       357440 :   return retval;
    3961              : }
    3962              : 
    3963              : /* Function vect_estimate_min_profitable_iters
    3964              : 
    3965              :    Return the number of iterations required for the vector version of the
    3966              :    loop to be profitable relative to the cost of the scalar version of the
    3967              :    loop.
    3968              : 
    3969              :    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
    3970              :    of iterations for vectorization.  -1 value means loop vectorization
    3971              :    is not profitable.  This returned value may be used for dynamic
    3972              :    profitability check.
    3973              : 
    3974              :    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
    3975              :    for static check against estimated number of iterations.  */
    3976              : 
    3977              : static void
    3978       141664 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
    3979              :                                     int *ret_min_profitable_niters,
    3980              :                                     int *ret_min_profitable_estimate,
    3981              :                                     unsigned *suggested_unroll_factor)
    3982              : {
    3983       141664 :   int min_profitable_iters;
    3984       141664 :   int min_profitable_estimate;
    3985       141664 :   int peel_iters_prologue;
    3986       141664 :   int peel_iters_epilogue;
    3987       141664 :   unsigned vec_inside_cost = 0;
    3988       141664 :   int vec_outside_cost = 0;
    3989       141664 :   unsigned vec_prologue_cost = 0;
    3990       141664 :   unsigned vec_epilogue_cost = 0;
    3991       141664 :   int scalar_single_iter_cost = 0;
    3992       141664 :   int scalar_outside_cost = 0;
    3993       141664 :   int assumed_vf = vect_vf_for_cost (loop_vinfo);
    3994       141664 :   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    3995       141664 :   vector_costs *target_cost_data = loop_vinfo->vector_costs;
    3996              : 
    3997              :   /* Cost model disabled.  */
    3998       141664 :   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    3999              :     {
    4000        16979 :       if (dump_enabled_p ())
    4001        10654 :         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
    4002        16979 :       *ret_min_profitable_niters = 0;
    4003        16979 :       *ret_min_profitable_estimate = 0;
    4004        16979 :       return;
    4005              :     }
    4006              : 
    4007              :   /* Requires loop versioning tests to handle misalignment.  */
    4008       124685 :   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    4009              :     {
    4010              :       /*  FIXME: Make cost depend on complexity of individual check.  */
    4011           18 :       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
    4012           18 :       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
    4013           18 :       if (dump_enabled_p ())
    4014            2 :         dump_printf (MSG_NOTE,
    4015              :                      "cost model: Adding cost of checks for loop "
    4016              :                      "versioning to treat misalignment.\n");
    4017              :     }
    4018              : 
    4019              :   /* Requires loop versioning with alias checks.  */
    4020       124685 :   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    4021              :     {
    4022              :       /*  FIXME: Make cost depend on complexity of individual check.  */
    4023         7114 :       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
    4024         7114 :       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
    4025         7114 :       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
    4026            4 :       if (len)
    4027              :         /* Count LEN - 1 ANDs and LEN comparisons.  */
    4028            4 :         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
    4029              :                               scalar_stmt, vect_prologue);
    4030         7114 :       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
    4031         1272 :       if (len)
    4032              :         {
    4033              :           /* Count LEN - 1 ANDs and LEN comparisons.  */
    4034         1272 :           unsigned int nstmts = len * 2 - 1;
    4035              :           /* +1 for each bias that needs adding.  */
    4036         2544 :           for (unsigned int i = 0; i < len; ++i)
    4037         1272 :             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
    4038          151 :               nstmts += 1;
    4039         1272 :           (void) add_stmt_cost (target_cost_data, nstmts,
    4040              :                                 scalar_stmt, vect_prologue);
    4041              :         }
    4042         7114 :       if (dump_enabled_p ())
    4043           32 :         dump_printf (MSG_NOTE,
    4044              :                      "cost model: Adding cost of checks for loop "
    4045              :                      "versioning aliasing.\n");
    4046              :     }
    4047              : 
    4048              :   /* Requires loop versioning with niter checks.  */
    4049       124685 :   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    4050              :     {
    4051              :       /*  FIXME: Make cost depend on complexity of individual check.  */
    4052          751 :       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
    4053              :                             NULL, NULL, NULL_TREE, 0, vect_prologue);
    4054          751 :       if (dump_enabled_p ())
    4055            1 :         dump_printf (MSG_NOTE,
    4056              :                      "cost model: Adding cost of checks for loop "
    4057              :                      "versioning niters.\n");
    4058              :     }
    4059              : 
    4060       124685 :   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    4061         7877 :     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
    4062              :                           vect_prologue);
    4063              : 
    4064              :   /* Count statements in scalar loop.  Using this as scalar cost for a single
    4065              :      iteration for now.
    4066              : 
    4067              :      TODO: Add outer loop support.
    4068              : 
    4069              :      TODO: Consider assigning different costs to different scalar
    4070              :      statements.  */
    4071              : 
    4072       124685 :   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
    4073              : 
    4074              :   /* Add additional cost for the peeled instructions in prologue and epilogue
    4075              :      loop.  (For fully-masked loops there will be no peeling.)
    4076              : 
    4077              :      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
    4078              :      at compile-time - we assume it's vf/2 (the worst would be vf-1).
    4079              : 
    4080              :      TODO: Build an expression that represents peel_iters for prologue and
    4081              :      epilogue to be used in a run-time test.  */
    4082              : 
    4083       124685 :   bool prologue_need_br_taken_cost = false;
    4084       124685 :   bool prologue_need_br_not_taken_cost = false;
    4085              : 
    4086              :   /* Calculate peel_iters_prologue.  */
    4087       124685 :   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    4088              :     peel_iters_prologue = 0;
    4089       124685 :   else if (npeel < 0)
    4090              :     {
    4091          279 :       peel_iters_prologue = assumed_vf / 2;
    4092          279 :       if (dump_enabled_p ())
    4093            8 :         dump_printf (MSG_NOTE, "cost model: "
    4094              :                      "prologue peel iters set to vf/2.\n");
    4095              : 
    4096              :       /* If peeled iterations are unknown, count a taken branch and a not taken
    4097              :          branch per peeled loop.  Even if scalar loop iterations are known,
    4098              :          vector iterations are not known since peeled prologue iterations are
    4099              :          not known.  Hence guards remain the same.  */
    4100              :       prologue_need_br_taken_cost = true;
    4101              :       prologue_need_br_not_taken_cost = true;
    4102              :     }
    4103              :   else
    4104              :     {
    4105       124406 :       peel_iters_prologue = npeel;
    4106       124406 :       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
    4107              :         /* If peeled iterations are known but number of scalar loop
    4108              :            iterations are unknown, count a taken branch per peeled loop.  */
    4109       124685 :         prologue_need_br_taken_cost = true;
    4110              :     }
    4111              : 
    4112       124685 :   bool epilogue_need_br_taken_cost = false;
    4113       124685 :   bool epilogue_need_br_not_taken_cost = false;
    4114              : 
    4115              :   /* Calculate peel_iters_epilogue.  */
    4116       124685 :   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    4117              :     /* We need to peel exactly one iteration for gaps.  */
    4118           26 :     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
    4119       124659 :   else if (npeel < 0)
    4120              :     {
    4121              :       /* If peeling for alignment is unknown, loop bound of main loop
    4122              :          becomes unknown.  */
    4123          279 :       peel_iters_epilogue = assumed_vf / 2;
    4124          279 :       if (dump_enabled_p ())
    4125            8 :         dump_printf (MSG_NOTE, "cost model: "
    4126              :                      "epilogue peel iters set to vf/2 because "
    4127              :                      "peeling for alignment is unknown.\n");
    4128              : 
    4129              :       /* See the same reason above in peel_iters_prologue calculation.  */
    4130              :       epilogue_need_br_taken_cost = true;
    4131              :       epilogue_need_br_not_taken_cost = true;
    4132              :     }
    4133              :   else
    4134              :     {
    4135       124380 :       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
    4136       124380 :       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
    4137              :         /* If peeled iterations are known but number of scalar loop
    4138              :            iterations are unknown, count a taken branch per peeled loop.  */
    4139       124685 :         epilogue_need_br_taken_cost = true;
    4140              :     }
    4141              : 
    4142              :   /* The way we accumulate peeling costs into the vector prologue/epilogue
    4143              :      cost is a bit awkward given we cannot reuse scalar_costs which is
    4144              :      already computed and also because it cannot take into account any
    4145              :      epilogue vectorization we'll carry out in the end.  */
    4146              : 
    4147       124685 :   stmt_info_for_cost *si;
    4148       124685 :   int j;
    4149              :   /* Add costs associated with peel_iters_prologue.  */
    4150       124685 :   if (peel_iters_prologue)
    4151         1068 :     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
    4152              :       {
    4153          775 :         (void) add_stmt_cost (target_cost_data,
    4154          775 :                               si->count * peel_iters_prologue, si->kind,
    4155              :                               si->stmt_info, si->node, si->vectype,
    4156              :                               si->misalign, vect_prologue);
    4157              :       }
    4158              : 
    4159              :   /* Add costs associated with peel_iters_epilogue.  */
    4160       124685 :   if (peel_iters_epilogue)
    4161       387630 :     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
    4162              :       {
    4163       310568 :         (void) add_stmt_cost (target_cost_data,
    4164       310568 :                               si->count * peel_iters_epilogue, si->kind,
    4165              :                               si->stmt_info, si->node, si->vectype,
    4166              :                               si->misalign, vect_epilogue);
    4167              :       }
    4168              : 
    4169              :   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
    4170              : 
    4171       124685 :   if (prologue_need_br_taken_cost)
    4172          279 :     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
    4173              :                           vect_prologue);
    4174              : 
    4175       124685 :   if (prologue_need_br_not_taken_cost)
    4176          279 :     (void) add_stmt_cost (target_cost_data, 1,
    4177              :                           cond_branch_not_taken, vect_prologue);
    4178              : 
    4179       124685 :   if (epilogue_need_br_taken_cost)
    4180        65274 :     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
    4181              :                           vect_epilogue);
    4182              : 
    4183       124685 :   if (epilogue_need_br_not_taken_cost)
    4184          279 :     (void) add_stmt_cost (target_cost_data, 1,
    4185              :                           cond_branch_not_taken, vect_epilogue);
    4186              : 
    4187              :   /* Take care of special costs for rgroup controls of partial vectors.  */
    4188           26 :   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
    4189       124711 :       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
    4190              :           == vect_partial_vectors_avx512))
    4191              :     {
    4192              :       /* Calculate how many masks we need to generate.  */
    4193           26 :       unsigned int num_masks = 0;
    4194           26 :       bool need_saturation = false;
    4195          108 :       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    4196           30 :         if (rgm.type)
    4197              :           {
    4198           26 :             unsigned nvectors = rgm.factor;
    4199           26 :             num_masks += nvectors;
    4200           26 :             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
    4201           26 :                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
    4202            9 :               need_saturation = true;
    4203              :           }
    4204              : 
    4205              :       /* ???  The target isn't able to identify the costs below as
    4206              :          producing masks so it cannot penalize cases where we'd run
    4207              :          out of mask registers for example.  */
    4208              : 
    4209              :       /* ???  We are also failing to account for smaller vector masks
    4210              :          we generate by splitting larger masks in vect_get_loop_mask.  */
    4211              : 
    4212              :       /* In the worst case, we need to generate each mask in the prologue
    4213              :          and in the loop body.  We need one splat per group and one
    4214              :          compare per mask.
    4215              : 
    4216              :          Sometimes the prologue mask will fold to a constant,
    4217              :          so the actual prologue cost might be smaller.  However, it's
    4218              :          simpler and safer to use the worst-case cost; if this ends up
    4219              :          being the tie-breaker between vectorizing or not, then it's
    4220              :          probably better not to vectorize.  */
    4221           26 :       (void) add_stmt_cost (target_cost_data,
    4222              :                             num_masks
    4223           26 :                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
    4224              :                             vector_stmt, NULL, NULL, NULL_TREE, 0,
    4225              :                             vect_prologue);
    4226           52 :       (void) add_stmt_cost (target_cost_data,
    4227              :                             num_masks
    4228           52 :                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
    4229              :                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
    4230              : 
    4231              :       /* When we need saturation we need it both in the prologue and
    4232              :          the epilogue.  */
    4233           26 :       if (need_saturation)
    4234              :         {
    4235            9 :           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
    4236              :                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
    4237            9 :           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
    4238              :                                 NULL, NULL, NULL_TREE, 0, vect_body);
    4239              :         }
    4240              :     }
    4241            0 :   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
    4242       124659 :            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
    4243              :                == vect_partial_vectors_while_ult))
    4244              :     {
    4245              :       /* Calculate how many masks we need to generate.  */
    4246              :       unsigned int num_masks = 0;
    4247              :       rgroup_controls *rgm;
    4248              :       unsigned int num_vectors_m1;
    4249            0 :       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
    4250              :                         num_vectors_m1, rgm)
    4251            0 :         if (rgm->type)
    4252            0 :           num_masks += num_vectors_m1 + 1;
    4253            0 :       gcc_assert (num_masks > 0);
    4254              : 
    4255              :       /* In the worst case, we need to generate each mask in the prologue
    4256              :          and in the loop body.  One of the loop body mask instructions
    4257              :          replaces the comparison in the scalar loop, and since we don't
    4258              :          count the scalar comparison against the scalar body, we shouldn't
    4259              :          count that vector instruction against the vector body either.
    4260              : 
    4261              :          Sometimes we can use unpacks instead of generating prologue
    4262              :          masks and sometimes the prologue mask will fold to a constant,
    4263              :          so the actual prologue cost might be smaller.  However, it's
    4264              :          simpler and safer to use the worst-case cost; if this ends up
    4265              :          being the tie-breaker between vectorizing or not, then it's
    4266              :          probably better not to vectorize.  */
    4267            0 :       (void) add_stmt_cost (target_cost_data, num_masks,
    4268              :                             vector_stmt, NULL, NULL, NULL_TREE, 0,
    4269              :                             vect_prologue);
    4270            0 :       (void) add_stmt_cost (target_cost_data, num_masks - 1,
    4271              :                             vector_stmt, NULL, NULL, NULL_TREE, 0,
    4272              :                             vect_body);
    4273              :     }
    4274       124659 :   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    4275              :     {
    4276              :       /* Referring to the functions vect_set_loop_condition_partial_vectors
    4277              :          and vect_set_loop_controls_directly, we need to generate each
    4278              :          length in the prologue and in the loop body if required. Although
    4279              :          there are some possible optimizations, we consider the worst case
    4280              :          here.  */
    4281              : 
    4282            0 :       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
    4283            0 :       signed char partial_load_store_bias
    4284              :         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
    4285            0 :       bool need_iterate_p
    4286            0 :         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
    4287            0 :            && !vect_known_niters_smaller_than_vf (loop_vinfo));
    4288              : 
    4289              :       /* Calculate how many statements to be added.  */
    4290            0 :       unsigned int prologue_stmts = 0;
    4291            0 :       unsigned int body_stmts = 0;
    4292              : 
    4293            0 :       rgroup_controls *rgc;
    4294            0 :       unsigned int num_vectors_m1;
    4295            0 :       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
    4296            0 :         if (rgc->type)
    4297              :           {
    4298              :             /* May need one SHIFT for nitems_total computation.  */
    4299            0 :             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
    4300            0 :             if (nitems != 1 && !niters_known_p)
    4301            0 :               prologue_stmts += 1;
    4302              : 
    4303              :             /* May need one MAX and one MINUS for wrap around.  */
    4304            0 :             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
    4305            0 :               prologue_stmts += 2;
    4306              : 
    4307              :             /* Need one MAX and one MINUS for each batch limit excepting for
    4308              :                the 1st one.  */
    4309            0 :             prologue_stmts += num_vectors_m1 * 2;
    4310              : 
    4311            0 :             unsigned int num_vectors = num_vectors_m1 + 1;
    4312              : 
    4313              :             /* Need to set up lengths in prologue, only one MIN required
    4314              :                for each since start index is zero.  */
    4315            0 :             prologue_stmts += num_vectors;
    4316              : 
    4317              :             /* If we have a non-zero partial load bias, we need one PLUS
    4318              :                to adjust the load length.  */
    4319            0 :             if (partial_load_store_bias != 0)
    4320            0 :               body_stmts += 1;
    4321              : 
    4322            0 :             unsigned int length_update_cost = 0;
    4323            0 :             if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
    4324              :               /* For decrement IV style, each only needs a single SELECT_VL
    4325              :                  or MIN from the beginning to calculate the number of elements
    4326              :                  that need to be processed in the current iteration.  */
    4327              :               length_update_cost = 1;
    4328              :             else
    4329              :               /* For increment IV style, each may need two MINs and one MINUS to
    4330              :                  update lengths in body for next iteration.  */
    4331            0 :               length_update_cost = 3;
    4332              : 
    4333            0 :             if (need_iterate_p)
    4334            0 :               body_stmts += length_update_cost * num_vectors;
    4335              :           }
    4336              : 
    4337            0 :       (void) add_stmt_cost (target_cost_data, prologue_stmts,
    4338              :                             scalar_stmt, vect_prologue);
    4339            0 :       (void) add_stmt_cost (target_cost_data, body_stmts,
    4340              :                             scalar_stmt, vect_body);
    4341              :     }
    4342              : 
    4343              :   /* FORNOW: The scalar outside cost is incremented in one of the
    4344              :      following ways:
    4345              : 
    4346              :      1. The vectorizer checks for alignment and aliasing and generates
    4347              :      a condition that allows dynamic vectorization.  A cost model
    4348              :      check is ANDED with the versioning condition.  Hence scalar code
    4349              :      path now has the added cost of the versioning check.
    4350              : 
    4351              :        if (cost > th & versioning_check)
    4352              :          jmp to vector code
    4353              : 
    4354              :      Hence run-time scalar is incremented by not-taken branch cost.
    4355              : 
    4356              :      2. The vectorizer then checks if a prologue is required.  If the
    4357              :      cost model check was not done before during versioning, it has to
    4358              :      be done before the prologue check.
    4359              : 
    4360              :        if (cost <= th)
    4361              :          prologue = scalar_iters
    4362              :        if (prologue == 0)
    4363              :          jmp to vector code
    4364              :        else
    4365              :          execute prologue
    4366              :        if (prologue == num_iters)
    4367              :          go to exit
    4368              : 
    4369              :      Hence the run-time scalar cost is incremented by a taken branch,
    4370              :      plus a not-taken branch, plus a taken branch cost.
    4371              : 
    4372              :      3. The vectorizer then checks if an epilogue is required.  If the
    4373              :      cost model check was not done before during prologue check, it
    4374              :      has to be done with the epilogue check.
    4375              : 
    4376              :        if (prologue == 0)
    4377              :          jmp to vector code
    4378              :        else
    4379              :          execute prologue
    4380              :        if (prologue == num_iters)
    4381              :          go to exit
    4382              :        vector code:
    4383              :          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
    4384              :            jmp to epilogue
    4385              : 
    4386              :      Hence the run-time scalar cost should be incremented by 2 taken
    4387              :      branches.
    4388              : 
    4389              :      TODO: The back end may reorder the BBS's differently and reverse
    4390              :      conditions/branch directions.  Change the estimates below to
    4391              :      something more reasonable.  */
    4392              : 
    4393              :   /* If the number of iterations is known and we do not do versioning, we can
    4394              :      decide whether to vectorize at compile time.  Hence the scalar version
    4395              :      does not carry cost model guard costs.  */
    4396        58567 :   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
    4397       183252 :       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    4398              :     {
    4399              :       /* Cost model check occurs at versioning.  */
    4400        67209 :       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    4401         7877 :         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
    4402              :       else
    4403              :         {
    4404              :           /* Cost model check occurs at prologue generation.  */
    4405        59332 :           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
    4406          152 :             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
    4407          152 :               + vect_get_stmt_cost (cond_branch_not_taken);
    4408              :           /* Cost model check occurs at epilogue generation.  */
    4409              :           else
    4410        59180 :             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
    4411              :         }
    4412              :     }
    4413              : 
    4414              :   /* Complete the target-specific cost calculations.  */
    4415       124685 :   loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
    4416       124685 :   vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
    4417       124685 :   vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
    4418       124685 :   vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
    4419       124685 :   if (suggested_unroll_factor)
    4420       124298 :     *suggested_unroll_factor
    4421       124298 :       = loop_vinfo->vector_costs->suggested_unroll_factor ();
    4422              : 
    4423       124298 :   if (suggested_unroll_factor && *suggested_unroll_factor > 1
    4424          416 :       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
    4425            0 :       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
    4426              :                     *suggested_unroll_factor,
    4427              :                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
    4428              :     {
    4429            0 :       if (dump_enabled_p ())
    4430            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4431              :                          "can't unroll as unrolled vectorization factor larger"
    4432              :                          " than maximum vectorization factor: "
    4433              :                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
    4434              :                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
    4435            0 :       *suggested_unroll_factor = 1;
    4436              :     }
    4437              : 
    4438       124685 :   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
    4439              : 
    4440       124685 :   if (dump_enabled_p ())
    4441              :     {
    4442         1087 :       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
    4443         1087 :       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
    4444              :                    vec_inside_cost);
    4445         1087 :       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
    4446              :                    vec_prologue_cost);
    4447         1087 :       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
    4448              :                    vec_epilogue_cost);
    4449         1087 :       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
    4450              :                    scalar_single_iter_cost);
    4451         1087 :       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
    4452              :                    scalar_outside_cost);
    4453         1087 :       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
    4454              :                    vec_outside_cost);
    4455         1087 :       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
    4456              :                    peel_iters_prologue);
    4457         1087 :       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
    4458              :                    peel_iters_epilogue);
    4459              :     }
    4460              : 
    4461              :   /* Calculate number of iterations required to make the vector version
    4462              :      profitable, relative to the loop bodies only.  The following condition
    4463              :      must hold true:
    4464              :      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
    4465              :      where
    4466              :      SIC = scalar iteration cost, VIC = vector iteration cost,
    4467              :      VOC = vector outside cost, VF = vectorization factor,
    4468              :      NPEEL = prologue iterations + epilogue iterations,
    4469              :      SOC = scalar outside cost for run time cost model check.  */
    4470              : 
    4471       124685 :   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
    4472       124685 :                           - vec_inside_cost);
    4473       124685 :   if (saving_per_viter <= 0)
    4474              :     {
    4475        23820 :       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
    4476            0 :         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
    4477              :                     "vectorization did not happen for a simd loop");
    4478              : 
    4479        23820 :       if (dump_enabled_p ())
    4480           30 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4481              :                          "cost model: the vector iteration cost = %d "
    4482              :                          "divided by the scalar iteration cost = %d "
    4483              :                          "is greater or equal to the vectorization factor = %d"
    4484              :                          ".\n",
    4485              :                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
    4486        23820 :       *ret_min_profitable_niters = -1;
    4487        23820 :       *ret_min_profitable_estimate = -1;
    4488        23820 :       return;
    4489              :     }
    4490              : 
    4491              :   /* ??? The "if" arm is written to handle all cases; see below for what
    4492              :      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
    4493       100865 :   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    4494              :     {
    4495              :       /* Rewriting the condition above in terms of the number of
    4496              :          vector iterations (vniters) rather than the number of
    4497              :          scalar iterations (niters) gives:
    4498              : 
    4499              :          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
    4500              : 
    4501              :          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
    4502              : 
    4503              :          For integer N, X and Y when X > 0:
    4504              : 
    4505              :          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
    4506           18 :       int outside_overhead = (vec_outside_cost
    4507           18 :                               - scalar_single_iter_cost * peel_iters_prologue
    4508           18 :                               - scalar_single_iter_cost * peel_iters_epilogue
    4509              :                               - scalar_outside_cost);
    4510              :       /* We're only interested in cases that require at least one
    4511              :          vector iteration.  */
    4512           18 :       int min_vec_niters = 1;
    4513           18 :       if (outside_overhead > 0)
    4514           13 :         min_vec_niters = outside_overhead / saving_per_viter + 1;
    4515              : 
    4516           18 :       if (dump_enabled_p ())
    4517            7 :         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
    4518              :                      min_vec_niters);
    4519              : 
    4520           18 :       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    4521              :         {
    4522              :           /* Now that we know the minimum number of vector iterations,
    4523              :              find the minimum niters for which the scalar cost is larger:
    4524              : 
    4525              :              SIC * niters > VIC * vniters + VOC - SOC
    4526              : 
    4527              :              We know that the minimum niters is no more than
    4528              :              vniters * VF + NPEEL, but it might be (and often is) less
    4529              :              than that if a partial vector iteration is cheaper than the
    4530              :              equivalent scalar code.  */
    4531           18 :           int threshold = (vec_inside_cost * min_vec_niters
    4532           18 :                            + vec_outside_cost
    4533           18 :                            - scalar_outside_cost);
    4534           18 :           if (threshold <= 0)
    4535              :             min_profitable_iters = 1;
    4536              :           else
    4537           18 :             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
    4538              :         }
    4539              :       else
    4540              :         /* Convert the number of vector iterations into a number of
    4541              :            scalar iterations.  */
    4542            0 :         min_profitable_iters = (min_vec_niters * assumed_vf
    4543            0 :                                 + peel_iters_prologue
    4544              :                                 + peel_iters_epilogue);
    4545              :     }
    4546              :   else
    4547              :     {
    4548       100847 :       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
    4549       100847 :                               * assumed_vf
    4550       100847 :                               - vec_inside_cost * peel_iters_prologue
    4551       100847 :                               - vec_inside_cost * peel_iters_epilogue);
    4552       100847 :       if (min_profitable_iters <= 0)
    4553              :         min_profitable_iters = 0;
    4554              :       else
    4555              :         {
    4556        85838 :           min_profitable_iters /= saving_per_viter;
    4557              : 
    4558        85838 :           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
    4559        85838 :               <= (((int) vec_inside_cost * min_profitable_iters)
    4560        85838 :                   + (((int) vec_outside_cost - scalar_outside_cost)
    4561              :                      * assumed_vf)))
    4562        85838 :             min_profitable_iters++;
    4563              :         }
    4564              :     }
    4565              : 
    4566       100865 :   if (dump_enabled_p ())
    4567         1057 :     dump_printf (MSG_NOTE,
    4568              :                  "  Calculated minimum iters for profitability: %d\n",
    4569              :                  min_profitable_iters);
    4570              : 
    4571       100865 :   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
    4572       100847 :       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    4573              :     /* We want the vectorized loop to execute at least once.  */
    4574              :     min_profitable_iters = assumed_vf + peel_iters_prologue;
    4575        22061 :   else if (min_profitable_iters < peel_iters_prologue)
    4576              :     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
    4577              :        vectorized loop executes at least once.  */
    4578              :     min_profitable_iters = peel_iters_prologue;
    4579              : 
    4580       100865 :   if (dump_enabled_p ())
    4581         1057 :     dump_printf_loc (MSG_NOTE, vect_location,
    4582              :                      "  Runtime profitability threshold = %d\n",
    4583              :                      min_profitable_iters);
    4584              : 
    4585       100865 :   *ret_min_profitable_niters = min_profitable_iters;
    4586              : 
    4587              :   /* Calculate number of iterations required to make the vector version
    4588              :      profitable, relative to the loop bodies only.
    4589              : 
    4590              :      Non-vectorized variant is SIC * niters and it must win over vector
    4591              :      variant on the expected loop trip count.  The following condition must hold true:
    4592              :      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
    4593              : 
    4594       100865 :   if (vec_outside_cost <= 0)
    4595              :     min_profitable_estimate = 0;
    4596              :   /* ??? This "else if" arm is written to handle all cases; see below for
    4597              :      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
    4598        90305 :   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    4599              :     {
    4600              :       /* This is a repeat of the code above, but with + SOC rather
    4601              :          than - SOC.  */
    4602           18 :       int outside_overhead = (vec_outside_cost
    4603           18 :                               - scalar_single_iter_cost * peel_iters_prologue
    4604           18 :                               - scalar_single_iter_cost * peel_iters_epilogue
    4605              :                               + scalar_outside_cost);
    4606           18 :       int min_vec_niters = 1;
    4607           18 :       if (outside_overhead > 0)
    4608           18 :         min_vec_niters = outside_overhead / saving_per_viter + 1;
    4609              : 
    4610           18 :       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    4611              :         {
    4612           18 :           int threshold = (vec_inside_cost * min_vec_niters
    4613           18 :                            + vec_outside_cost
    4614           18 :                            + scalar_outside_cost);
    4615           18 :           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
    4616              :         }
    4617              :       else
    4618              :         min_profitable_estimate = (min_vec_niters * assumed_vf
    4619              :                                    + peel_iters_prologue
    4620              :                                    + peel_iters_epilogue);
    4621              :     }
    4622              :   else
    4623              :     {
    4624        90287 :       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
    4625        90287 :                                  * assumed_vf
    4626        90287 :                                  - vec_inside_cost * peel_iters_prologue
    4627        90287 :                                  - vec_inside_cost * peel_iters_epilogue)
    4628        90287 :                                  / ((scalar_single_iter_cost * assumed_vf)
    4629              :                                    - vec_inside_cost);
    4630              :     }
    4631       100865 :   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
    4632       100865 :   if (dump_enabled_p ())
    4633         1057 :     dump_printf_loc (MSG_NOTE, vect_location,
    4634              :                      "  Static estimate profitability threshold = %d\n",
    4635              :                      min_profitable_estimate);
    4636              : 
    4637       100865 :   *ret_min_profitable_estimate = min_profitable_estimate;
    4638              : }
    4639              : 
    4640              : /* Write into SEL the mask for a vec_perm that is equivalent to a vec_shr by
    4641              :    OFFSET vector elements (not bits) on a vector with NELT elements.  */
    4642              : static void
    4643         2293 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
    4644              :                               vec_perm_builder *sel)
    4645              : {
    4646              :   /* The encoding is a single stepped pattern: OFFSET, OFFSET + 1, OFFSET + 2.
    4647              :      Any wrap-around is handled by vec_perm_indices.  */
    4648         2293 :   sel->new_vector (nelt, 1, 3);
    4649         9172 :   for (unsigned int i = 0; i < 3; i++)
    4650         6879 :     sel->quick_push (i + offset);
    4651         2293 : }
    4652              : 
    4653              : /* Return true if the target supports whole-vector shifts for vectors of mode
    4654              :    MODE: either it implements vec_shr_optab, or MODE has a constant number of
    4655              :    elements NELT and vec_perm_const can shift by NELT/2, NELT/4, ..., 1.  */
    4656              : static bool
    4657        13726 : have_whole_vector_shift (machine_mode mode)
    4658              : {
    4659        13726 :   if (can_implement_p (vec_shr_optab, mode))
    4660              :     return true;
    4661              : 
    4662              :   /* Variable-length vectors should be handled via the optab.  */
    4663           63 :   unsigned int nelt;
    4664          126 :   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
    4665              :     return false;
    4666              : 
    4667           63 :   vec_perm_builder sel;
    4668           63 :   vec_perm_indices indices;
    4669          315 :   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
    4670              :     {
    4671          252 :       calc_vec_perm_mask_for_shift (i, nelt, &sel);
    4672          252 :       indices.new_vector (sel, 2, nelt);
    4673          252 :       if (!can_vec_perm_const_p (mode, mode, indices, false))
    4674              :         return false;
    4675              :     }
    4676              :   return true;
    4677           63 : }
    4678              : 
    4679              : /* Return true if (a) the representative statement of SLP_NODE is a
    4680              :    DOT_PROD_EXPR reduction whose multiplication operands have differing
    4681              :    signs and (b) we intend to emulate the operation using a series of signed
    4682              :    DOT_PROD_EXPRs.  See vect_emulate_mixed_dot_prod for the actual sequence.  */
    4683              : 
    4684              : static bool
    4685         2457 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
    4686              : {
    4687         2457 :   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
    4688         2457 :   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
    4689         2004 :   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
    4690              :     return false;
    4691              : 
    4692          825 :   tree rhs1 = gimple_assign_rhs1 (assign);
    4693          825 :   tree rhs2 = gimple_assign_rhs2 (assign);
    4694          825 :   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
    4695              :     return false;
    4696              : 
    4697          627 :   return !directly_supported_p (DOT_PROD_EXPR,
    4698              :                                 SLP_TREE_VECTYPE (slp_node),
    4699          209 :                                 SLP_TREE_VECTYPE
    4700              :                                   (SLP_TREE_CHILDREN (slp_node)[0]),
    4701          209 :                                 optab_vector_mixed_sign);
    4702              : }
    4703              : 
    4704              : /* TODO: There is a close dependency between the vect_model_*_cost and
    4705              :    vectorizable_* functions.  Redesign to avoid maintenance issues.  */
    4706              : 
    4707              : /* Function vect_model_reduction_cost.
    4708              : 
    4709              :    Record in COST_VEC the cost of the reduction NODE of kind REDUCTION_TYPE:
    4710              :    the vector ops generated within the strip-mine loop in some cases, the
    4711              :    initial definition before the loop, and the epilogue code.  */
    4712              : 
    4713              : static void
    4714        72216 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
    4715              :                            slp_tree node, internal_fn reduc_fn,
    4716              :                            vect_reduction_type reduction_type,
    4717              :                            int ncopies, stmt_vector_for_cost *cost_vec)
    4718              : {
    4719        72216 :   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
    4720        72216 :   tree vectype;
    4721        72216 :   machine_mode mode;
    4722        72216 :   class loop *loop = NULL;
    4723              : 
    4724        72216 :   if (loop_vinfo)
    4725        72216 :     loop = LOOP_VINFO_LOOP (loop_vinfo);
    4726              : 
    4727              :   /* Condition reductions generate two reductions in the loop.  */
    4728        72216 :   if (reduction_type == COND_REDUCTION)
    4729          324 :     ncopies *= 2;
    4730              : 
    4731        72216 :   vectype = SLP_TREE_VECTYPE (node);
    4732        72216 :   mode = TYPE_MODE (vectype);
    4733        72216 :   stmt_vec_info orig_stmt_info
    4734        72216 :     = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
    4735              : 
    4736        72216 :   gimple_match_op op;
    4737        72216 :   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
    4738            0 :     gcc_unreachable ();
    4739              : 
    4740        72216 :   if (reduction_type == EXTRACT_LAST_REDUCTION)
    4741              :     /* No extra instructions are needed in the prologue.  The loop body
    4742              :        operations are costed in vectorizable_condition.  */
    4743              :     inside_cost = 0;
    4744        72216 :   else if (reduction_type == FOLD_LEFT_REDUCTION)
    4745              :     {
    4746              :       /* No extra instructions needed in the prologue.  */
    4747         4281 :       prologue_cost = 0;
    4748              : 
    4749         4281 :       if (reduc_fn != IFN_LAST)
    4750              :         /* Count one reduction-like operation per vector.  */
    4751            0 :         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
    4752              :                                         node, 0, vect_body);
    4753              :       else
    4754              :         {
    4755              :           /* Use NCOPIES deconstructs and NELEMENTS scalar ops.  */
    4756         4281 :           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
    4757         4281 :           inside_cost = record_stmt_cost (cost_vec, ncopies,
    4758              :                                           vec_deconstruct, node, 0,
    4759              :                                           vect_body);
    4760         4281 :           inside_cost += record_stmt_cost (cost_vec, nelements,
    4761              :                                            scalar_stmt, node, 0,
    4762              :                                            vect_body);
    4763              :         }
    4764              :     }
    4765              :   else
    4766              :     {
    4767              :       /* Add in the cost of the initial definitions.  */
    4768        67935 :       int prologue_stmts;
    4769        67935 :       if (reduction_type == COND_REDUCTION)
    4770              :         /* For cond reductions we have four vectors: initial index, step,
    4771              :            initial result of the data reduction, initial value of the index
    4772              :            reduction.  */
    4773              :         prologue_stmts = 4;
    4774              :       else
    4775              :         /* We need the initial reduction value.  */
    4776        67611 :         prologue_stmts = 1;
    4777        67935 :       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
    4778              :                                          scalar_to_vec, node, 0,
    4779              :                                          vect_prologue);
    4780              :     }
    4781              : 
    4782              :   /* Determine cost of epilogue code (none if nested_in_vect_loop_p holds).
    4783              : 
    4784              :      With REDUC_FN we have a reduction operator that will reduce the vector
    4785              :      in one statement.  Also requires scalar extract.  */
    4786              : 
    4787        72216 :   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
    4788              :     {
    4789        72032 :       if (reduc_fn != IFN_LAST)
    4790              :         {
    4791        52443 :           if (reduction_type == COND_REDUCTION)
    4792              :             {
    4793              :               /* An EQ stmt and a COND_EXPR stmt.  */
    4794           18 :               epilogue_cost += record_stmt_cost (cost_vec, 2,
    4795              :                                                  vector_stmt, node, 0,
    4796              :                                                  vect_epilogue);
    4797              :               /* Reduction of the max index and a reduction of the found
    4798              :                  values.  */
    4799           18 :               epilogue_cost += record_stmt_cost (cost_vec, 2,
    4800              :                                                  vec_to_scalar, node, 0,
    4801              :                                                  vect_epilogue);
    4802              :               /* A broadcast of the max value.  */
    4803           18 :               epilogue_cost += record_stmt_cost (cost_vec, 1,
    4804              :                                                  scalar_to_vec, node, 0,
    4805              :                                                  vect_epilogue);
    4806              :             }
    4807              :           else
    4808              :             {
    4809        52425 :               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
    4810              :                                                  node, 0, vect_epilogue);
    4811        52425 :               epilogue_cost += record_stmt_cost (cost_vec, 1,
    4812              :                                                  vec_to_scalar, node, 0,
    4813              :                                                  vect_epilogue);
    4814              :             }
    4815              :         }
    4816        19589 :       else if (reduction_type == COND_REDUCTION)
    4817              :         {
    4818          306 :           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
    4819              :           /* Extraction of scalar elements.  */
    4820          306 :           epilogue_cost += record_stmt_cost (cost_vec, 2,
    4821              :                                              vec_deconstruct, node, 0,
    4822              :                                              vect_epilogue);
    4823              :           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
    4824          306 :           epilogue_cost += record_stmt_cost (cost_vec,
    4825          306 :                                              2 * estimated_nunits - 3,
    4826              :                                              scalar_stmt, node, 0,
    4827              :                                              vect_epilogue);
    4828              :         }
    4829        19283 :       else if (reduction_type == EXTRACT_LAST_REDUCTION
    4830        19283 :                || reduction_type == FOLD_LEFT_REDUCTION)
    4831              :         /* No extra instructions are needed in the epilogue.  */
    4832              :         ;
    4833              :       else
    4834              :         {
    4835        15002 :           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
    4836        15002 :           tree bitsize = TYPE_SIZE (op.type);
    4837        15002 :           int element_bitsize = tree_to_uhwi (bitsize);
    4838        15002 :           int nelements = vec_size_in_bits / element_bitsize;
    4839              : 
    4840        15002 :           if (op.code == COND_EXPR)
    4841           31 :             op.code = MAX_EXPR;
    4842              : 
    4843              :           /* We have a whole vector shift available.  */
    4844         3141 :           if (VECTOR_MODE_P (mode)
    4845        15002 :               && directly_supported_p (op.code, vectype)
    4846        26759 :               && have_whole_vector_shift (mode))
    4847              :             {
    4848              :               /* Final reduction via log2 (NELEMENTS) vector shifts and as many
    4849              :                  reduction ops.  Also requires scalar extract.  */
    4850        35271 :               epilogue_cost += record_stmt_cost (cost_vec,
    4851        23514 :                                                  exact_log2 (nelements) * 2,
    4852              :                                                  vector_stmt, node, 0,
    4853              :                                                  vect_epilogue);
    4854        11757 :               epilogue_cost += record_stmt_cost (cost_vec, 1,
    4855              :                                                  vec_to_scalar, node, 0,
    4856              :                                                  vect_epilogue);
    4857              :             }
    4858              :           else
    4859              :             /* Use extracts and reduction op for final reduction.  For N
    4860              :                elements, we have N extracts and N-1 reduction ops.  */
    4861         3245 :             epilogue_cost += record_stmt_cost (cost_vec,
    4862         3245 :                                                nelements + nelements - 1,
    4863              :                                                vector_stmt, node, 0,
    4864              :                                                vect_epilogue);
    4865              :         }
    4866              :     }
    4867              : 
    4868        72216 :   if (dump_enabled_p ())
    4869         3009 :     dump_printf (MSG_NOTE,
    4870              :                  "vect_model_reduction_cost: inside_cost = %d, "
    4871              :                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
    4872              :                  prologue_cost, epilogue_cost);
    4873        72216 : }
    4874              : 
    4875              : /* SEQ is a sequence of instructions that initialize the reduction described
    4876              :    by REDUC_INFO in LOOP_VINFO.  Emit them in the appropriate place.  */
    4877              : 
    4878              : static void
    4879          462 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
    4880              :                                 vect_reduc_info reduc_info, gimple *seq)
    4881              : {
    4882          462 :   if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
    4883              :     {
    4884              :       /* When reusing an accumulator from the main loop, we only need
    4885              :          initialization instructions if the main loop can be skipped.
    4886              :          In that case, emit the initialization instructions before the last
    4887              :          statement of the guard block that does the skip.  */
    4888           22 :       edge skip_edge = loop_vinfo->skip_main_loop_edge;
    4889           22 :       gcc_assert (skip_edge);
    4890           22 :       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
    4891           22 :       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    4892              :     }
    4893              :   else
    4894              :     {
    4895              :       /* The normal case: emit the initialization instructions on the
    4896              :          preheader edge.  */
    4897          440 :       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    4898          440 :       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
    4899              :     }
    4900          462 : }
    4901              : 
    4902              : /* Get at the initial defs for the reduction PHIs for REDUC_INFO, which
    4903              :    performs a reduction involving GROUP_SIZE scalar statements.  Store the
    4904              :    NUMBER_OF_VECTORS vector defs of type VECTOR_TYPE in VEC_OPRNDS.  If
    4905              :    NEUTRAL_OP is nonnull, introducing extra elements of that value will not
    4906              :    change the result.  */
    4907              : 
    4908              : static void
    4909        21799 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
    4910              :                                 vect_reduc_info reduc_info,
    4911              :                                 tree vector_type,
    4912              :                                 vec<tree> *vec_oprnds,
    4913              :                                 unsigned int number_of_vectors,
    4914              :                                 unsigned int group_size, tree neutral_op)
    4915              : {
    4916        21799 :   vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
    4917        21799 :   unsigned HOST_WIDE_INT nunits;
    4918        21799 :   unsigned j, number_of_places_left_in_vector;
    4919        21799 :   unsigned int i;
    4920              : 
    4921        43598 :   gcc_assert (group_size == initial_values.length () || neutral_op);
    4922              : 
    4923              :   /* The same values may need to be used several times in the created
    4924              :      vectors; that happens if unrolling is performed.
    4925              : 
    4926              :      For example, we have two scalar operands, s1 and s2 (e.g., group of
    4927              :      strided accesses of size two), while NUNITS is four (i.e., four scalars
    4928              :      of this type can be packed in a vector).  The output vector will contain
    4929              :      two copies of each scalar operand: {s1, s2, s1, s2}.  (With a NEUTRAL_OP,
    4930              :      the repeats are replaced by NEUTRAL_OP.)
    4931              : 
    4932              :      If GROUP_SIZE > NUNITS, the scalars will be split into several
    4933              :      vectors containing the operands.
    4934              : 
    4935              :      For example, NUNITS is four as before, and the group size is 8
    4936              :      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
    4937              :      {s5, s6, s7, s8}.  */
    4938              : 
    4939        21799 :   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    4940              :     nunits = group_size;
    4941              : 
    4942        21799 :   tree vector_elt_type = TREE_TYPE (vector_type);
    4943        21799 :   number_of_places_left_in_vector = nunits;
    4944        21799 :   bool constant_p = true;
    4945        21799 :   tree_vector_builder elts (vector_type, nunits, 1);
    4946        21799 :   elts.quick_grow (nunits);
    4947        21799 :   gimple_seq ctor_seq = NULL;
    4948        21799 :   if (neutral_op
    4949        43014 :       && !useless_type_conversion_p (vector_elt_type,
    4950        21215 :                                      TREE_TYPE (neutral_op)))
    4951              :     {
    4952          242 :       if (VECTOR_BOOLEAN_TYPE_P (vector_type))
    4953          221 :         neutral_op = gimple_build (&ctor_seq, COND_EXPR,
    4954              :                                    vector_elt_type,
    4955              :                                    neutral_op,
    4956              :                                    build_all_ones_cst (vector_elt_type),
    4957              :                                    build_zero_cst (vector_elt_type));
    4958              :       else
    4959           21 :         neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
    4960              :     }
    4961       204229 :   for (j = 0; j < nunits * number_of_vectors; ++j)
    4962              :     {
    4963       182430 :       tree op;
    4964       182430 :       i = j % group_size;
    4965              : 
    4966              :       /* Get the def before the loop.  In a reduction chain we have only one
    4967              :          initial value; else as many as there are PHIs in the group.  */
    4968       182430 :       if (i >= initial_values.length () || (j > i && neutral_op))
    4969              :         op = neutral_op;
    4970              :       else
    4971              :         {
    4972        51392 :           if (!useless_type_conversion_p (vector_elt_type,
    4973        25696 :                                           TREE_TYPE (initial_values[i])))
    4974              :             {
    4975          257 :               if (VECTOR_BOOLEAN_TYPE_P (vector_type))
    4976          466 :                 initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
    4977              :                                                   vector_elt_type,
    4978          233 :                                                   initial_values[i],
    4979              :                                                   build_all_ones_cst
    4980              :                                                     (vector_elt_type),
    4981              :                                                   build_zero_cst
    4982              :                                                     (vector_elt_type));
    4983              :               else
    4984           48 :                 initial_values[i] = gimple_convert (&ctor_seq,
    4985              :                                                     vector_elt_type,
    4986           24 :                                                     initial_values[i]);
    4987              :             }
    4988        25696 :           op = initial_values[i];
    4989              :         }
    4990              : 
    4991              :       /* Create 'vect_ = {op0,op1,...,opn}'.  */
    4992       182430 :       number_of_places_left_in_vector--;
    4993       182430 :       elts[nunits - number_of_places_left_in_vector - 1] = op;
    4994       182430 :       if (!CONSTANT_CLASS_P (op))
    4995         2497 :         constant_p = false;
    4996              : 
    4997       182430 :       if (number_of_places_left_in_vector == 0)
    4998              :         {
    4999        23365 :           tree init;
    5000        46730 :           if (constant_p && !neutral_op
    5001        46449 :               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
    5002        23365 :               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
    5003              :             /* Build the vector directly from ELTS.  */
    5004        23365 :             init = gimple_build_vector (&ctor_seq, &elts);
    5005            0 :           else if (neutral_op)
    5006              :             {
    5007              :               /* Build a vector of the neutral value and shift the
    5008              :                  other elements into place.  */
    5009            0 :               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
    5010              :                                                    neutral_op);
    5011            0 :               int k = nunits;
    5012            0 :               while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
    5013              :                 k -= 1;
    5014            0 :               while (k > 0)
    5015              :                 {
    5016            0 :                   k -= 1;
    5017            0 :                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
    5018            0 :                                        vector_type, init, elts[k]);
    5019              :                 }
    5020              :             }
    5021              :           else
    5022              :             {
    5023              :               /* First time round, duplicate ELTS to fill the
    5024              :                  required number of vectors.  */
    5025            0 :               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
    5026              :                                         elts, number_of_vectors, *vec_oprnds);
    5027            0 :               break;
    5028              :             }
    5029        23365 :           vec_oprnds->quick_push (init);
    5030              : 
    5031        23365 :           number_of_places_left_in_vector = nunits;
    5032        23365 :           elts.new_vector (vector_type, nunits, 1);
    5033        23365 :           elts.quick_grow (nunits);
    5034        23365 :           constant_p = true;
    5035              :         }
    5036              :     }
    5037        21799 :   if (ctor_seq != NULL)
    5038          462 :     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
    5039        21799 : }
    5040              : /* Return LOOP_VINFO's reduc_infos[ID] for NODE's cycle ID, or NULL if -1.  */
    5041              : vect_reduc_info
    5042       161998 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
    5043              : {
    5044       161998 :   if (node->cycle_info.id == -1)
    5045              :     return NULL;
    5046       160030 :   return loop_vinfo->reduc_infos[node->cycle_info.id];
    5047              : }
    5048              : 
    5049              : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
    5050              :    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
    5051              :    return false.  */
    5052              : 
    5053              : static bool
    5054        21440 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
    5055              :                                 vect_reduc_info reduc_info, tree vectype)
    5056              : {
    5057        21440 :   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
    5058        21440 :   if (!main_loop_vinfo)
    5059              :     return false;
    5060              : 
    5061         4672 :   if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    5062              :     return false;
    5063              : 
    5064              :   /* We are not set up to handle vector bools when they are not mapped
    5065              :      to vector integer data types.  */
    5066         4657 :   if (VECTOR_BOOLEAN_TYPE_P (vectype)
    5067         4729 :       && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
    5068              :     return false;
    5069              : 
    5070         4655 :   unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
    5071         4655 :   auto_vec<tree, 16> main_loop_results (num_phis);
    5072         4655 :   auto_vec<tree, 16> initial_values (num_phis);
    5073         4655 :   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    5074              :     {
    5075              :       /* The epilogue loop can be entered either from the main loop or
    5076              :          from an earlier guard block.  */
    5077         4432 :       edge skip_edge = loop_vinfo->skip_main_loop_edge;
    5078        17752 :       for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
    5079              :         {
    5080              :           /* Look for:
    5081              : 
    5082              :                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
    5083              :                                     INITIAL_VALUE(guard block)>.  */
    5084         4456 :           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
    5085              : 
    5086         4456 :           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
    5087         4456 :           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
    5088              : 
    5089         4456 :           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
    5090         4456 :           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
    5091              : 
    5092         4456 :           main_loop_results.quick_push (from_main_loop);
    5093         4456 :           initial_values.quick_push (from_skip);
    5094              :         }
    5095              :     }
    5096              :   else
    5097              :     /* The main loop dominates the epilogue loop.  */
    5098          223 :     main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
    5099              : 
    5100              :   /* See if the main loop has the kind of accumulator we need.  */
    5101         4655 :   vect_reusable_accumulator *accumulator
    5102         4655 :     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
    5103         4655 :   if (!accumulator
    5104         9294 :       || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
    5105        13945 :       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
    5106              :                       VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
    5107              :     return false;
    5108              : 
    5109              :   /* Handle the case where we can reduce wider vectors to narrower ones.  */
    5110         4645 :   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
    5111         4645 :   unsigned HOST_WIDE_INT m;
    5112         4645 :   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
    5113         4645 :                             TYPE_VECTOR_SUBPARTS (vectype), &m))
    5114            0 :     return false;
    5115              :   /* Check the intermediate vector types and operations are available.  */
    5116         4645 :   tree prev_vectype = old_vectype;
    5117         4645 :   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
    5118        13561 :   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    5119              :     {
    5120         4795 :       intermediate_nunits = exact_div (intermediate_nunits, 2);
    5121         4795 :       tree intermediate_vectype = get_related_vectype_for_scalar_type
    5122         4795 :         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
    5123         4795 :       if (!intermediate_vectype
    5124         4795 :           || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
    5125              :                                     intermediate_vectype)
    5126         9070 :           || !can_vec_extract (TYPE_MODE (prev_vectype),
    5127         4275 :                                TYPE_MODE (intermediate_vectype)))
    5128              :         return false;
    5129              :       prev_vectype = intermediate_vectype;
    5130              :     }
    5131              : 
    5132              :   /* Non-SLP reductions might apply an adjustment after the reduction
    5133              :      operation, in order to simplify the initialization of the accumulator.
    5134              :      If the epilogue loop carries on from where the main loop left off,
    5135              :      it should apply the same adjustment to the final reduction result.
    5136              : 
    5137              :      If the epilogue loop can also be entered directly (rather than via
    5138              :      the main loop), we need to be able to handle that case in the same way,
    5139              :      with the same adjustment.  (In principle we could add a PHI node
    5140              :      to select the correct adjustment, but in practice that shouldn't be
    5141              :      necessary.)  */
    5142         4121 :   tree main_adjustment
    5143         4121 :     = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
    5144         4121 :   if (loop_vinfo->main_loop_edge && main_adjustment)
    5145              :     {
    5146         3435 :       gcc_assert (num_phis == 1);
    5147         3435 :       tree initial_value = initial_values[0];
    5148              :       /* Check that we can use INITIAL_VALUE as the adjustment and
    5149              :          initialize the accumulator with a neutral value instead.  */
    5150         3435 :       if (!operand_equal_p (initial_value, main_adjustment))
    5151              :         return false;
    5152         3425 :       initial_values[0] = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
    5153              :     }
    5154         4111 :   VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
    5155         4111 :   VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
    5156         4111 :   VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
    5157         4111 :   VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
    5158         4111 :   return true;
    5159         4655 : }
    5160              : 
    5161              : /* Reduce the vector VEC_DEF down to VECTYPE by repeatedly combining its two
    5162              :    halves with CODE, adding stmts to SEQ.  Returns a vector def of VECTYPE.  */
    5163              : 
    5164              : static tree
    5165         4155 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
    5166              :                             gimple_seq *seq)
    5167              : {
    5168         4155 :   gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
    5169              :               || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
    5170              :                   == MODE_VECTOR_INT));
    5171         4155 :   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
    5172         4155 :   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
    5173         4155 :   tree stype = TREE_TYPE (vectype);
    5174         4155 :   tree new_temp = vec_def;
    5175         8453 :   while (nunits > nunits1)
    5176              :     {
    5177         4298 :       nunits /= 2;
    5178         4298 :       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
    5179         4298 :                                                            stype, nunits);
    5180         4298 :       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
    5181              : 
    5182              :       /* The target has to make sure we support lowpart/highpart
    5183              :          extraction, either via direct vector extract or through
    5184              :          integer mode punning.  */
    5185         4298 :       tree dst1, dst2;
    5186         4298 :       gimple *epilog_stmt;
    5187         4298 :       if (convert_optab_handler (vec_extract_optab,
    5188         4298 :                                  TYPE_MODE (TREE_TYPE (new_temp)),
    5189         4298 :                                  TYPE_MODE (vectype1))
    5190              :           != CODE_FOR_nothing)
    5191              :         {
    5192              :           /* Extract both halves directly as VECTYPE1 BIT_FIELD_REFs
    5193              :              at bit offsets 0 and BITSIZE.  */
    5194         2618 :           dst1 = make_ssa_name (vectype1);
    5195         2618 :           epilog_stmt
    5196         5236 :               = gimple_build_assign (dst1, BIT_FIELD_REF,
    5197              :                                      build3 (BIT_FIELD_REF, vectype1,
    5198         2618 :                                              new_temp, TYPE_SIZE (vectype1),
    5199              :                                              bitsize_int (0)));
    5200         2618 :           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5201         2618 :           dst2 =  make_ssa_name (vectype1);
    5202         2618 :           epilog_stmt
    5203         2618 :               = gimple_build_assign (dst2, BIT_FIELD_REF,
    5204              :                                      build3 (BIT_FIELD_REF, vectype1,
    5205         2618 :                                              new_temp, TYPE_SIZE (vectype1),
    5206         2618 :                                              bitsize_int (bitsize)));
    5207         2618 :           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5208              :         }
    5209              :       else
    5210              :         {
    5211              :           /* Extract via punning to a vector of two BITSIZE-bit integers,
    5212              :              viewing each extracted element as VECTYPE1.  */
    5213         1680 :           tree eltype = build_nonstandard_integer_type (bitsize, 1);
    5214         1680 :           tree etype = build_vector_type (eltype, 2);
    5215         3360 :           gcc_assert (convert_optab_handler (vec_extract_optab,
    5216              :                                              TYPE_MODE (etype),
    5217              :                                              TYPE_MODE (eltype))
    5218              :                       != CODE_FOR_nothing);
    5219         1680 :           tree tem = make_ssa_name (etype);
    5220         1680 :           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
    5221              :                                              build1 (VIEW_CONVERT_EXPR,
    5222              :                                                      etype, new_temp));
    5223         1680 :           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5224         1680 :           new_temp = tem;
    5225         1680 :           tem = make_ssa_name (eltype);
    5226         1680 :           epilog_stmt
    5227         3360 :               = gimple_build_assign (tem, BIT_FIELD_REF,
    5228              :                                      build3 (BIT_FIELD_REF, eltype,
    5229         1680 :                                              new_temp, TYPE_SIZE (eltype),
    5230              :                                              bitsize_int (0)));
    5231         1680 :           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5232         1680 :           dst1 = make_ssa_name (vectype1);
    5233         1680 :           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
    5234              :                                              build1 (VIEW_CONVERT_EXPR,
    5235              :                                                      vectype1, tem));
    5236         1680 :           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5237         1680 :           tem = make_ssa_name (eltype);
    5238         1680 :           epilog_stmt
    5239         1680 :               = gimple_build_assign (tem, BIT_FIELD_REF,
    5240              :                                      build3 (BIT_FIELD_REF, eltype,
    5241         1680 :                                              new_temp, TYPE_SIZE (eltype),
    5242         1680 :                                              bitsize_int (bitsize)));
    5243         1680 :           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5244         1680 :           dst2 =  make_ssa_name (vectype1);
    5245         1680 :           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
    5246              :                                              build1 (VIEW_CONVERT_EXPR,
    5247              :                                                      vectype1, tem));
    5248         1680 :           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5249              :         }
    5250              : 
    5251         4298 :       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    5252              :     }
    5253         4155 :   if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
    5254              :     {
    5255           66 :       tree dst3 = make_ssa_name (vectype);
    5256           66 :       gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
    5257              :                                                  build1 (VIEW_CONVERT_EXPR,
    5258              :                                                          vectype, new_temp));
    5259           66 :       gimple_seq_add_stmt_without_update (seq, epilog_stmt);
    5260           66 :       new_temp = dst3;
    5261              :     }
    5262              : 
    5263         4155 :   return new_temp;
    5264              : }
    5265              : 
    5266              : /* Function vect_create_epilog_for_reduction
    5267              : 
    5268              :    Create code at the loop-epilog to finalize the result of a reduction
    5269              :    computation.
    5270              : 
    5271              :    STMT_INFO is the scalar reduction stmt that is being vectorized.
    5272              :    SLP_NODE is an SLP node containing a group of reduction statements. The
    5273              :      first one in this group is STMT_INFO.
    5274              :    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
    5275              :    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
    5276              :      (counting from 0)
    5277              :    LOOP_EXIT is the edge to update in the merge block.  In the case of a single
    5278              :      exit this edge is always the main loop exit.
    5279              : 
    5280              :    This function:
    5281              :    1. Completes the reduction def-use cycles.
    5282              :    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
    5283              :       by calling the function specified by REDUC_FN if available, or by
    5284              :       other means (whole-vector shifts or a scalar loop).
    5285              :       The function also creates a new phi node at the loop exit to preserve
    5286              :       loop-closed form, as illustrated below.
    5287              : 
    5288              :      The flow at the entry to this function:
    5289              : 
    5290              :         loop:
    5291              :           vec_def = phi <vec_init, null>        # REDUCTION_PHI
    5292              :           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
    5293              :           s_loop = scalar_stmt                  # (scalar) STMT_INFO
    5294              :         loop_exit:
    5295              :           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
    5296              :           use <s_out0>
    5297              :           use <s_out0>
    5298              : 
    5299              :      The above is transformed by this function into:
    5300              : 
    5301              :         loop:
    5302              :           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
    5303              :           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
    5304              :           s_loop = scalar_stmt                  # (scalar) STMT_INFO
    5305              :         loop_exit:
    5306              :           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
    5307              :           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
    5308              :           v_out2 = reduce <v_out1>
    5309              :           s_out3 = extract_field <v_out2, 0>
    5310              :           s_out4 = adjust_result <s_out3>
    5311              :           use <s_out4>
    5312              :           use <s_out4>
    5313              : */
    5314              : 
    5315              : static void
    5316        22146 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
    5317              :                                   stmt_vec_info stmt_info,
    5318              :                                   slp_tree slp_node,
    5319              :                                   slp_instance slp_node_instance,
    5320              :                                   edge loop_exit)
    5321              : {
    5322        22146 :   vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
    5323        22146 :   code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
    5324        22146 :   internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
    5325        22146 :   tree vectype;
    5326        22146 :   machine_mode mode;
    5327        22146 :   basic_block exit_bb;
    5328        22146 :   gimple *new_phi = NULL, *phi = NULL;
    5329        22146 :   gimple_stmt_iterator exit_gsi;
    5330        22146 :   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
    5331        22146 :   gimple *epilog_stmt = NULL;
    5332        22146 :   gimple *exit_phi;
    5333        22146 :   tree def;
    5334        22146 :   tree orig_name, scalar_result;
    5335        22146 :   imm_use_iterator imm_iter;
    5336        22146 :   use_operand_p use_p;
    5337        22146 :   gimple *use_stmt;
    5338        22146 :   auto_vec<tree> reduc_inputs;
    5339        22146 :   int j, i;
    5340        22146 :   vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
    5341        22146 :   unsigned int k;
    5342              :   /* SLP reduction without reduction chain, e.g.,
    5343              :      # a1 = phi <a2, a0>
    5344              :      # b1 = phi <b2, b0>
    5345              :      a2 = operation (a1)
    5346              :      b2 = operation (b1)  */
    5347        22146 :   const bool slp_reduc = !reduc_info->is_reduc_chain;
    5348        22146 :   tree induction_index = NULL_TREE;
    5349              : 
    5350        22146 :   unsigned int group_size = SLP_TREE_LANES (slp_node);
    5351              : 
    5352        22146 :   bool double_reduc = false;
    5353        22146 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    5354        22146 :   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    5355              :     {
    5356            0 :       double_reduc = true;
    5357            0 :       gcc_assert (slp_reduc);
    5358              :     }
    5359              : 
    5360        22146 :   vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
    5361        22146 :   gcc_assert (vectype);
    5362        22146 :   mode = TYPE_MODE (vectype);
    5363              : 
    5364        22146 :   tree induc_val = NULL_TREE;
    5365        22146 :   tree adjustment_def = NULL;
    5366              :   /* Optimize: for induction condition reduction, if we can't use zero
    5367              :      for induc_val, use initial_def.  */
    5368        22146 :   if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
    5369           62 :     induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
    5370        22084 :   else if (double_reduc)
    5371              :     ;
    5372              :   else
    5373        22084 :     adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
    5374              : 
    5375        22146 :   stmt_vec_info single_live_out_stmt[] = { stmt_info };
    5376        22146 :   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
    5377        22146 :   if (slp_reduc)
    5378              :     /* All statements produce live-out values.  */
    5379        43860 :     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
    5380              : 
    5381        22146 :   unsigned vec_num
    5382        22146 :     = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
    5383              : 
    5384              :   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
    5385              :      which is updated with the current index of the loop for every match of
    5386              :      the original loop's cond_expr (VEC_STMT).  This results in a vector
    5387              :      containing the last time the condition passed for that vector lane.
    5388              :      The first match will be a 1 to allow 0 to be used for non-matching
    5389              :      indexes.  If there are no matches at all then the vector will be all
    5390              :      zeroes.
    5391              : 
    5392              :      PR92772: This algorithm is broken for architectures that support
    5393              :      masked vectors, but do not provide fold_extract_last.  */
    5394        22146 :   if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
    5395              :     {
    5396           87 :       gcc_assert (!double_reduc);
    5397           87 :       auto_vec<std::pair<tree, bool>, 2> ccompares;
    5398           87 :       slp_tree cond_node = slp_node_instance->root;
    5399          183 :       while (cond_node != slp_node_instance->reduc_phis)
    5400              :         {
    5401           96 :           stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
    5402           96 :           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
    5403              :             {
    5404           96 :               gimple *vec_stmt
    5405           96 :                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
    5406           96 :               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
    5407           96 :               ccompares.safe_push
    5408           96 :                 (std::make_pair (gimple_assign_rhs1 (vec_stmt),
    5409           96 :                                  SLP_TREE_REDUC_IDX (cond_node) == 2));
    5410              :             }
    5411           96 :           int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
    5412           96 :           cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
    5413              :         }
    5414           87 :       gcc_assert (ccompares.length () != 0);
    5415              : 
    5416           87 :       tree indx_before_incr, indx_after_incr;
    5417           87 :       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
    5418           87 :       int scalar_precision
    5419           87 :         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
    5420           87 :       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
    5421           87 :       tree cr_index_vector_type = get_related_vectype_for_scalar_type
    5422           87 :         (TYPE_MODE (vectype), cr_index_scalar_type,
    5423              :          TYPE_VECTOR_SUBPARTS (vectype));
    5424              : 
    5425              :       /* First we create a simple vector induction variable which starts
    5426              :          with the values {1,2,3,...} (SERIES_VECT) and increments by the
    5427              :          vector size (STEP).  */
    5428              : 
    5429              :       /* Create a {1,2,3,...} vector.  */
    5430           87 :       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
    5431              : 
    5432              :       /* Create a vector of the step value.  */
    5433           87 :       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
    5434           87 :       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
    5435              : 
    5436              :       /* Create an induction variable.  */
    5437           87 :       gimple_stmt_iterator incr_gsi;
    5438           87 :       bool insert_after;
    5439           87 :       vect_iv_increment_position (LOOP_VINFO_MAIN_EXIT (loop_vinfo),
    5440              :                                   &incr_gsi, &insert_after);
    5441           87 :       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
    5442              :                  insert_after, &indx_before_incr, &indx_after_incr);
    5443              : 
    5444              :       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
    5445              :          filled with zeros (VEC_ZERO).  */
    5446              : 
    5447              :       /* Create a vector of 0s.  */
    5448           87 :       tree zero = build_zero_cst (cr_index_scalar_type);
    5449           87 :       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
    5450              : 
    5451              :       /* Create a vector phi node.  */
    5452           87 :       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
    5453           87 :       new_phi = create_phi_node (new_phi_tree, loop->header);
    5454           87 :       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
    5455              :                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
    5456              : 
    5457              :       /* Now take the condition from the loop's original cond_exprs
    5458              :          and produce new cond_exprs (INDEX_COND_EXPR) which for
    5459              :          every match uses values from the induction variable
    5460              :          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
    5461              :          (NEW_PHI_TREE).
    5462              :          Finally, we update the phi (NEW_PHI_TREE) to take the value of
    5463              :          the new cond_expr (INDEX_COND_EXPR).  */
    5464           87 :       gimple_seq stmts = NULL;
    5465          270 :       for (int i = ccompares.length () - 1; i != -1; --i)
    5466              :         {
    5467           96 :           tree ccompare = ccompares[i].first;
    5468           96 :           if (ccompares[i].second)
    5469           69 :             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
    5470              :                                          cr_index_vector_type,
    5471              :                                          ccompare,
    5472              :                                          indx_before_incr, new_phi_tree);
    5473              :           else
    5474           27 :             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
    5475              :                                          cr_index_vector_type,
    5476              :                                          ccompare,
    5477              :                                          new_phi_tree, indx_before_incr);
    5478              :         }
    5479           87 :       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
    5480              : 
    5481              :       /* Update the phi with the vec cond.  */
    5482           87 :       induction_index = new_phi_tree;
    5483           87 :       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
    5484              :                    loop_latch_edge (loop), UNKNOWN_LOCATION);
    5485           87 :     }
    5486              : 
    5487              :   /* 2. Create epilog code.
    5488              :         The reduction epilog code operates across the elements of the vector
    5489              :         of partial results computed by the vectorized loop.
    5490              :         The reduction epilog code consists of:
    5491              : 
    5492              :         step 1: compute the scalar result in a vector (v_out2)
    5493              :         step 2: extract the scalar result (s_out3) from the vector (v_out2)
    5494              :         step 3: adjust the scalar result (s_out3) if needed.
    5495              : 
    5496              :         Step 1 can be accomplished using one of the following three schemes:
    5497              :           (scheme 1) using reduc_fn, if available.
    5498              :           (scheme 2) using whole-vector shifts, if available.
    5499              :           (scheme 3) using a scalar loop. In this case steps 1+2 above are
    5500              :                      combined.
    5501              : 
    5502              :           The overall epilog code looks like this:
    5503              : 
    5504              :           s_out0 = phi <s_loop>         # original EXIT_PHI
    5505              :           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
    5506              :           v_out2 = reduce <v_out1>              # step 1
    5507              :           s_out3 = extract_field <v_out2, 0>    # step 2
    5508              :           s_out4 = adjust_result <s_out3>       # step 3
    5509              : 
    5510              :           (step 3 is optional, and steps 1 and 2 may be combined).
    5511              :           Lastly, the uses of s_out0 are replaced by s_out4.  */
    5512              : 
    5513              : 
    5514              :   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
    5515              :          v_out1 = phi <VECT_DEF>
    5516              :          Store them in NEW_PHIS.  */
    5517              :   /* We need to reduce values in all exits.  */
    5518        22146 :   exit_bb = loop_exit->dest;
    5519        22146 :   exit_gsi = gsi_after_labels (exit_bb);
    5520        22146 :   reduc_inputs.create (vec_num);
    5521        45868 :   for (unsigned i = 0; i < vec_num; i++)
    5522              :     {
    5523        23722 :       gimple_seq stmts = NULL;
    5524        23722 :       def = vect_get_slp_vect_def (slp_node, i);
    5525        23722 :       tree new_def = copy_ssa_name (def);
    5526        23722 :       phi = create_phi_node (new_def, exit_bb);
    5527        23722 :       if (LOOP_VINFO_MAIN_EXIT (loop_vinfo) == loop_exit)
    5528        23695 :         SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
    5529              :       else
    5530              :         {
    5531           57 :           for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
    5532           30 :             SET_PHI_ARG_DEF (phi, k, def);
    5533              :         }
    5534        23722 :       new_def = gimple_convert (&stmts, vectype, new_def);
    5535        23722 :       reduc_inputs.quick_push (new_def);
    5536        23722 :       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    5537              :     }
    5538              : 
    5539              :   /* 2.2 Get the original scalar reduction variable as defined in the loop.
    5540              :          In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
    5541              :          pattern), the scalar-def is taken from the original stmt that the
    5542              :          pattern-stmt (STMT) replaces.  */
    5543              : 
    5544        22973 :   tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
    5545        22146 :   tree scalar_type = TREE_TYPE (scalar_dest);
    5546        22146 :   scalar_results.truncate (0);
    5547        22146 :   scalar_results.reserve_exact (group_size);
    5548        22146 :   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
    5549              : 
    5550              :   /* True if we should implement SLP_REDUC using native reduction operations
    5551              :      instead of scalar operations.  */
    5552        22146 :   const bool direct_slp_reduc
    5553        22146 :     = (reduc_fn != IFN_LAST
    5554        22146 :        && slp_reduc
    5555        22146 :        && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
    5556              : 
    5557              :   /* If signed overflow is undefined we might need to perform reduction
    5558              :      computations in an unsigned type.  */
    5559        22146 :   tree compute_vectype = vectype;
    5560        22146 :   if (ANY_INTEGRAL_TYPE_P (vectype)
    5561        15061 :       && TYPE_OVERFLOW_UNDEFINED (vectype)
    5562         5614 :       && code.is_tree_code ()
    5563        27760 :       && arith_code_with_undefined_signed_overflow ((tree_code) code))
    5564         4108 :     compute_vectype = unsigned_type_for (vectype);
    5565              : 
    5566              :   /* In case of reduction chain, e.g.,
    5567              :      # a1 = phi <a3, a0>
    5568              :      a2 = operation (a1)
    5569              :      a3 = operation (a2),
    5570              : 
    5571              :      we may end up with more than one vector result.  Here we reduce them
    5572              :      to one vector.
    5573              : 
    5574              :      The same is true for a SLP reduction, e.g.,
    5575              :      # a1 = phi <a2, a0>
    5576              :      # b1 = phi <b2, b0>
    5577              :      a2 = operation (a1)
    5578              :      b2 = operation (a2),
    5579              : 
    5580              :      where we can end up with more than one vector as well.  We can
    5581              :      easily accumulate vectors when the number of vector elements is
    5582              :      a multiple of the SLP group size.
    5583              : 
    5584              :      The same is true if we couldn't use a single defuse cycle.  */
    5585        22146 :   if ((!slp_reduc
    5586              :        || direct_slp_reduc
    5587              :        || (slp_reduc
    5588        22146 :            && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
    5589        44292 :       && reduc_inputs.length () > 1)
    5590              :     {
    5591          542 :       gimple_seq stmts = NULL;
    5592          542 :       tree single_input = reduc_inputs[0];
    5593          542 :       if (compute_vectype != vectype)
    5594          156 :         single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
    5595              :                                      compute_vectype, single_input);
    5596         1965 :       for (k = 1; k < reduc_inputs.length (); k++)
    5597              :         {
    5598         1423 :           tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
    5599         1423 :                                      compute_vectype, reduc_inputs[k]);
    5600         1423 :           single_input = gimple_build (&stmts, code, compute_vectype,
    5601              :                                        single_input, input);
    5602              :         }
    5603          542 :       if (compute_vectype != vectype)
    5604          156 :         single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
    5605              :                                      vectype, single_input);
    5606          542 :       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    5607              : 
    5608          542 :       reduc_inputs.truncate (0);
    5609          542 :       reduc_inputs.safe_push (single_input);
    5610              :     }
    5611              : 
    5612        22146 :   tree orig_reduc_input = reduc_inputs[0];
    5613              : 
    5614              :   /* If this loop is an epilogue loop that can be skipped after the
    5615              :      main loop, we can only share a reduction operation between the
    5616              :      main loop and the epilogue if we put it at the target of the
    5617              :      skip edge.
    5618              : 
    5619              :      We can still reuse accumulators if this check fails.  Doing so has
    5620              :      the minor(?) benefit of making the epilogue loop's scalar result
    5621              :      independent of the main loop's scalar result.  */
    5622        22146 :   bool unify_with_main_loop_p = false;
    5623        22146 :   if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
    5624         4111 :       && loop_vinfo->skip_this_loop_edge
    5625         3871 :       && single_succ_p (exit_bb)
    5626        22167 :       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
    5627              :     {
    5628           21 :       unify_with_main_loop_p = true;
    5629              : 
    5630           21 :       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
    5631           21 :       reduc_inputs[0] = make_ssa_name (vectype);
    5632           21 :       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
    5633           21 :       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
    5634              :                    UNKNOWN_LOCATION);
    5635           21 :       add_phi_arg (new_phi,
    5636           21 :                    VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
    5637              :                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
    5638           21 :       exit_gsi = gsi_after_labels (reduc_block);
    5639              :     }
    5640              : 
    5641              :   /* Shouldn't be used beyond this point.  */
    5642        22146 :   exit_bb = nullptr;
    5643              : 
    5644              :   /* If we are operating on a mask vector and do not support direct mask
    5645              :      reduction, work on a bool data vector instead of a mask vector.  */
    5646        22146 :   if (VECTOR_BOOLEAN_TYPE_P (vectype)
    5647          249 :       && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
    5648        22345 :       && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
    5649              :     {
    5650          199 :       compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
    5651          199 :       gimple_seq stmts = NULL;
    5652          406 :       for (unsigned i = 0; i < reduc_inputs.length (); ++i)
    5653          414 :         reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
    5654          207 :                                         reduc_inputs[i],
    5655              :                                         build_one_cst (vectype),
    5656              :                                         build_zero_cst (vectype));
    5657          199 :       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    5658              :     }
    5659              : 
    5660        22146 :   if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
    5661           87 :       && reduc_fn != IFN_LAST)
    5662              :     {
    5663              :       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
    5664              :          various data values where the condition matched and another vector
    5665              :          (INDUCTION_INDEX) containing all the indexes of those matches.  We
    5666              :          need to extract the last matching index (which will be the index with
    5667              :          highest value) and use this to index into the data vector.
    5668              :          For the case where there were no matches, the data vector will contain
    5669              :          all default values and the index vector will be all zeros.  */
    5670              : 
    5671              :       /* Get various versions of the type of the vector of indexes.  */
    5672           14 :       tree index_vec_type = TREE_TYPE (induction_index);
    5673           14 :       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
    5674           14 :       tree index_scalar_type = TREE_TYPE (index_vec_type);
    5675           14 :       tree index_vec_cmp_type = truth_type_for (index_vec_type);
    5676              : 
    5677              :       /* Get an unsigned integer version of the type of the data vector.  */
    5678           14 :       int scalar_precision
    5679           14 :         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
    5680           14 :       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
    5681           14 :       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
    5682              :                                                 vectype);
    5683              : 
    5684              :       /* First we need to create a vector (ZERO_VEC) of zeros and another
    5685              :          vector (MAX_INDEX_VEC) filled with the last matching index, which we
    5686              :          can create using a MAX reduction and then expanding.
    5687              :          In the case where the loop never made any matches, the max index will
    5688              :          be zero.  */
    5689              : 
    5690              :       /* Vector of {0, 0, 0,...}.  */
    5691           14 :       tree zero_vec = build_zero_cst (vectype);
    5692              : 
    5693              :       /* Find maximum value from the vector of found indexes.  */
    5694           14 :       tree max_index = make_ssa_name (index_scalar_type);
    5695           14 :       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
    5696              :                                                           1, induction_index);
    5697           14 :       gimple_call_set_lhs (max_index_stmt, max_index);
    5698           14 :       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
    5699              : 
    5700              :       /* Vector of {max_index, max_index, max_index,...}.  */
    5701           14 :       tree max_index_vec = make_ssa_name (index_vec_type);
    5702           14 :       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
    5703              :                                                       max_index);
    5704           14 :       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
    5705              :                                                         max_index_vec_rhs);
    5706           14 :       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
    5707              : 
    5708              :       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
    5709              :          with the vector (INDUCTION_INDEX) of found indexes, choosing values
    5710              :          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
    5711              :          otherwise.  Only one value should match, resulting in a vector
    5712              :          (VEC_COND) with one data value and the rest zeros.
    5713              :          In the case where the loop never made any matches, every index will
    5714              :          match, resulting in a vector with all data values (which will all be
    5715              :          the default value).  */
    5716              : 
    5717              :       /* Compare the max index vector to the vector of found indexes to find
    5718              :          the position of the max value.  */
    5719           14 :       tree vec_compare = make_ssa_name (index_vec_cmp_type);
    5720           14 :       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
    5721              :                                                       induction_index,
    5722              :                                                       max_index_vec);
    5723           14 :       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
    5724              : 
    5725              :       /* Use the compare to choose either values from the data vector or
    5726              :          zero.  */
    5727           14 :       tree vec_cond = make_ssa_name (vectype);
    5728           14 :       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
    5729              :                                                    vec_compare,
    5730           14 :                                                    reduc_inputs[0],
    5731              :                                                    zero_vec);
    5732           14 :       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
    5733              : 
    5734              :       /* Finally we need to extract the data value from the vector (VEC_COND)
    5735              :          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
    5736              :          reduction, but because this doesn't exist, we can use a MAX reduction
    5737              :          instead.  The data value might be signed or a float so we need to cast
    5738              :          it first.
    5739              :          In the case where the loop never made any matches, the data values are
    5740              :          all identical, and so will reduce down correctly.  */
    5741              : 
    5742              :       /* Make the matched data values unsigned.  */
    5743           14 :       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
    5744           14 :       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
    5745              :                                        vec_cond);
    5746           14 :       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
    5747              :                                                         VIEW_CONVERT_EXPR,
    5748              :                                                         vec_cond_cast_rhs);
    5749           14 :       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
    5750              : 
    5751              :       /* Reduce down to a scalar value.  */
    5752           14 :       tree data_reduc = make_ssa_name (scalar_type_unsigned);
    5753           14 :       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
    5754              :                                                            1, vec_cond_cast);
    5755           14 :       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
    5756           14 :       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
    5757              : 
    5758              :       /* Convert the reduced value back to the result type and set as the
    5759              :          result.  */
    5760           14 :       gimple_seq stmts = NULL;
    5761           14 :       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
    5762              :                                data_reduc);
    5763           14 :       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    5764           14 :       scalar_results.safe_push (new_temp);
    5765           14 :     }
    5766        22132 :   else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
    5767           73 :            && reduc_fn == IFN_LAST)
    5768              :     {
    5769              :       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
    5770              :          idx = 0;
    5771              :          idx_val = induction_index[0];
    5772              :          val = data_reduc[0];
    5773              :          for (i = 1; i < nelts; ++i)
    5774              :            if (induction_index[i] > idx_val)
    5775              :              val = data_reduc[i], idx_val = induction_index[i];
    5776              :          return val;  */
    5777              : 
    5778           73 :       tree data_eltype = TREE_TYPE (vectype);
    5779           73 :       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
    5780           73 :       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
    5781           73 :       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
    5782              :       /* Enforced by vectorizable_reduction, which ensures we have target
    5783              :          support before allowing a conditional reduction on variable-length
    5784              :          vectors.  */
    5785           73 :       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
    5786           73 :       tree idx_val = NULL_TREE, val = NULL_TREE;
    5787          469 :       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
    5788              :         {
    5789          396 :           tree old_idx_val = idx_val;
    5790          396 :           tree old_val = val;
    5791          396 :           idx_val = make_ssa_name (idx_eltype);
    5792          396 :           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
    5793              :                                              build3 (BIT_FIELD_REF, idx_eltype,
    5794              :                                                      induction_index,
    5795          396 :                                                      bitsize_int (el_size),
    5796          396 :                                                      bitsize_int (off)));
    5797          396 :           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    5798          396 :           val = make_ssa_name (data_eltype);
    5799          792 :           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
    5800              :                                              build3 (BIT_FIELD_REF,
    5801              :                                                      data_eltype,
    5802          396 :                                                      reduc_inputs[0],
    5803          396 :                                                      bitsize_int (el_size),
    5804          396 :                                                      bitsize_int (off)));
    5805          396 :           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    5806          396 :           if (off != 0)
    5807              :             {
    5808          323 :               tree new_idx_val = idx_val;
    5809          323 :               if (off != v_size - el_size)
    5810              :                 {
    5811          250 :                   new_idx_val = make_ssa_name (idx_eltype);
    5812          250 :                   epilog_stmt = gimple_build_assign (new_idx_val,
    5813              :                                                      MAX_EXPR, idx_val,
    5814              :                                                      old_idx_val);
    5815          250 :                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    5816              :                 }
    5817          323 :               tree cond = make_ssa_name (boolean_type_node);
    5818          323 :               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
    5819              :                                                  idx_val, old_idx_val);
    5820          323 :               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    5821          323 :               tree new_val = make_ssa_name (data_eltype);
    5822          323 :               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
    5823              :                                                  cond, val, old_val);
    5824          323 :               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    5825          323 :               idx_val = new_idx_val;
    5826          323 :               val = new_val;
    5827              :             }
    5828              :         }
    5829              :       /* Convert the reduced value back to the result type and set as the
    5830              :          result.  */
    5831           73 :       gimple_seq stmts = NULL;
    5832           73 :       val = gimple_convert (&stmts, scalar_type, val);
    5833           73 :       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    5834           73 :       scalar_results.safe_push (val);
    5835           73 :     }
    5836              : 
    5837              :   /* 2.3 Create the reduction code, using one of the three schemes described
    5838              :          above. In SLP we simply need to extract all the elements from the
    5839              :          vector (without reducing them), so we use scalar shifts.  */
    5840        22059 :   else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
    5841              :     {
    5842        20090 :       tree tmp;
    5843        20090 :       tree vec_elem_type;
    5844              : 
    5845              :       /* Case 1:  Create:
    5846              :          v_out2 = reduc_expr <v_out1>  */
    5847              : 
    5848        20090 :       if (dump_enabled_p ())
    5849         1517 :         dump_printf_loc (MSG_NOTE, vect_location,
    5850              :                          "Reduce using direct vector reduction.\n");
    5851              : 
    5852        20090 :       gimple_seq stmts = NULL;
    5853        20090 :       vec_elem_type = TREE_TYPE (vectype);
    5854        20090 :       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
    5855        20090 :                                vec_elem_type, reduc_inputs[0]);
    5856        20090 :       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
    5857        20090 :       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    5858              : 
    5859        20090 :       if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
    5860           62 :           && induc_val)
    5861              :         {
    5862              :           /* Earlier we set the initial value to be a vector of induc_val
    5863              :              values.  Check the result and if it is induc_val then replace
    5864              :              with the original initial value, unless induc_val is
    5865              :              the same as initial_def already.  */
    5866           60 :           tree zcompare = make_ssa_name (boolean_type_node);
    5867           60 :           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
    5868              :                                              new_temp, induc_val);
    5869           60 :           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    5870           60 :           tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
    5871           60 :           tmp = make_ssa_name (new_scalar_dest);
    5872           60 :           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
    5873              :                                              initial_def, new_temp);
    5874           60 :           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    5875           60 :           new_temp = tmp;
    5876              :         }
    5877              : 
    5878        20090 :       scalar_results.safe_push (new_temp);
    5879        20090 :     }
    5880         1782 :   else if (direct_slp_reduc)
    5881              :     {
    5882              :       /* Here we create one vector for each of the GROUP_SIZE results,
    5883              :          with the elements for other SLP statements replaced with the
    5884              :          neutral value.  We can then do a normal reduction on each vector.  */
    5885              : 
    5886              :       /* Enforced by vectorizable_reduction.  */
    5887              :       gcc_assert (reduc_inputs.length () == 1);
    5888              :       gcc_assert (pow2p_hwi (group_size));
    5889              : 
    5890              :       gimple_seq seq = NULL;
    5891              : 
    5892              :       /* Build a vector {0, 1, 2, ...}, with the same number of elements
    5893              :          and the same element size as VECTYPE.  */
    5894              :       tree index = build_index_vector (vectype, 0, 1);
    5895              :       tree index_type = TREE_TYPE (index);
    5896              :       tree index_elt_type = TREE_TYPE (index_type);
    5897              :       tree mask_type = truth_type_for (index_type);
    5898              : 
    5899              :       /* Create a vector that, for each element, identifies which of
    5900              :          the results should use it.  */
    5901              :       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
    5902              :       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
    5903              :                             build_vector_from_val (index_type, index_mask));
    5904              : 
    5905              :       /* Get a neutral vector value.  This is simply a splat of the neutral
    5906              :          scalar value if we have one, otherwise the initial scalar value
    5907              :          is itself a neutral value.  */
    5908              :       tree vector_identity = NULL_TREE;
    5909              :       tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
    5910              :                                                   NULL_TREE, false);
    5911              :       if (neutral_op)
    5912              :         vector_identity = gimple_build_vector_from_val (&seq, vectype,
    5913              :                                                         neutral_op);
    5914              :       for (unsigned int i = 0; i < group_size; ++i)
    5915              :         {
    5916              :           /* If there's no universal neutral value, we can use the
    5917              :              initial scalar value from the original PHI.  This is used
    5918              :              for MIN and MAX reduction, for example.  */
    5919              :           if (!neutral_op)
    5920              :             {
    5921              :               tree scalar_value
    5922              :                 = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
    5923              :               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
    5924              :                                              scalar_value);
    5925              :               vector_identity = gimple_build_vector_from_val (&seq, vectype,
    5926              :                                                               scalar_value);
    5927              :             }
    5928              : 
    5929              :           /* Calculate the equivalent of:
    5930              : 
    5931              :              sel[j] = (index[j] == i);
    5932              : 
    5933              :              which selects the elements of REDUC_INPUTS[0] that should
    5934              :              be included in the result.  */
    5935              :           tree compare_val = build_int_cst (index_elt_type, i);
    5936              :           compare_val = build_vector_from_val (index_type, compare_val);
    5937              :           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
    5938              :                                    index, compare_val);
    5939              : 
    5940              :           /* Calculate the equivalent of:
    5941              : 
    5942              :              vec = sel ? reduc_inputs[0] : vector_identity;
    5943              : 
    5944              :              VEC is now suitable for a full vector reduction.  */
    5945              :           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
    5946              :                                    sel, reduc_inputs[0], vector_identity);
    5947              : 
    5948              :           /* Do the reduction and convert it to the appropriate type.  */
    5949              :           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
    5950              :                                       TREE_TYPE (vectype), vec);
    5951              :           scalar = gimple_convert (&seq, scalar_type, scalar);
    5952              :           scalar_results.safe_push (scalar);
    5953              :         }
    5954              :       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
    5955              :     }
    5956              :   else
    5957              :     {
    5958         1782 :       bool reduce_with_shift;
    5959         1782 :       tree vec_temp;
    5960              : 
    5961         1782 :       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
    5962              : 
    5963              :       /* See if the target wants to do the final (shift) reduction
    5964              :          in a vector mode of smaller size and first reduce upper/lower
    5965              :          halves against each other.  */
    5966         1969 :       enum machine_mode mode1 = mode;
    5967         1969 :       tree stype = TREE_TYPE (vectype);
    5968         1969 :       if (compute_vectype != vectype)
    5969              :         {
    5970          546 :           stype = unsigned_type_for (stype);
    5971          546 :           gimple_seq stmts = NULL;
    5972         1150 :           for (unsigned i = 0; i < reduc_inputs.length (); ++i)
    5973              :             {
    5974          604 :               tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
    5975          604 :                                             compute_vectype, reduc_inputs[i]);
    5976          604 :               reduc_inputs[i] = new_temp;
    5977              :             }
    5978          546 :           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    5979              :         }
    5980         1969 :       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
    5981         1969 :       unsigned nunits1 = nunits;
    5982         1969 :       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
    5983         1969 :           && reduc_inputs.length () == 1)
    5984              :         {
    5985           41 :           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
    5986              :           /* For SLP reductions we have to make sure lanes match up, but
    5987              :              since we're doing individual element final reduction reducing
    5988              :              vector width here is even more important.
    5989              :              ???  We can also separate lanes with permutes, for the common
    5990              :              case of power-of-two group-size odd/even extracts would work.  */
    5991           41 :           if (slp_reduc && nunits != nunits1)
    5992              :             {
    5993           41 :               nunits1 = least_common_multiple (nunits1, group_size);
    5994           82 :               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
    5995              :             }
    5996              :         }
    5997         1928 :       else if (!slp_reduc
    5998         1928 :                && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
    5999            0 :         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
    6000              : 
    6001         1969 :       tree vectype1 = compute_vectype;
    6002         1969 :       if (mode1 != mode)
    6003              :         {
    6004           47 :           vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
    6005           47 :                                                           stype, nunits1);
    6006              :           /* First reduce the vector to the desired vector size we should
    6007              :              do shift reduction on by combining upper and lower halves.  */
    6008           47 :           gimple_seq stmts = NULL;
    6009           47 :           new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
    6010              :                                                  code, &stmts);
    6011           47 :           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    6012           47 :           reduc_inputs[0] = new_temp;
    6013              :         }
    6014              : 
    6015         1969 :       reduce_with_shift = have_whole_vector_shift (mode1);
    6016          728 :       if (!VECTOR_MODE_P (mode1)
    6017         2695 :           || !directly_supported_p (code, vectype1))
    6018              :         reduce_with_shift = false;
    6019              : 
    6020         1952 :       if (reduce_with_shift && (!slp_reduc || group_size == 1))
    6021              :         {
    6022         1729 :           int element_bitsize = vector_element_bits (vectype1);
    6023              :           /* Enforced by vectorizable_reduction, which disallows SLP reductions
    6024              :              for variable-length vectors and also requires direct target support
    6025              :              for loop reductions.  */
    6026         1729 :           int nelements = TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
    6027         1729 :           vec_perm_builder sel;
    6028         1729 :           vec_perm_indices indices;
    6029              : 
    6030         1729 :           int elt_offset;
    6031              : 
    6032         1729 :           tree zero_vec = build_zero_cst (vectype1);
    6033              :           /* Case 2: Create:
    6034              :              for (offset = nelements/2; offset >= 1; offset/=2)
    6035              :                 {
    6036              :                   Create:  va' = vec_shift <va, offset>
    6037              :                   Create:  va = vop <va, va'>
    6038              :                 }  */
    6039              : 
    6040         1729 :           if (dump_enabled_p ())
    6041          368 :             dump_printf_loc (MSG_NOTE, vect_location,
    6042              :                              "Reduce using vector shifts\n");
    6043              : 
    6044         1729 :           gimple_seq stmts = NULL;
    6045         1729 :           new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
    6046         1729 :           for (elt_offset = nelements / 2;
    6047         3770 :                elt_offset >= 1;
    6048         2041 :                elt_offset /= 2)
    6049              :             {
    6050         2041 :               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
    6051         2041 :               indices.new_vector (sel, 2, nelements);
    6052         2041 :               tree mask = vect_gen_perm_mask_any (vectype1, indices);
    6053         2041 :               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
    6054              :                                        new_temp, zero_vec, mask);
    6055         2041 :               new_temp = gimple_build (&stmts, code,
    6056              :                                        vectype1, new_name, new_temp);
    6057              :             }
    6058              : 
    6059              :           /* 2.4  Extract the final scalar result.  Create:
    6060              :              s_out3 = extract_field <v_out2, bitpos>  */
    6061              : 
    6062         1729 :           if (dump_enabled_p ())
    6063          368 :             dump_printf_loc (MSG_NOTE, vect_location,
    6064              :                              "extract scalar result\n");
    6065              : 
    6066         1729 :           new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
    6067         1729 :                                    new_temp, bitsize_int (element_bitsize),
    6068         1729 :                                    bitsize_zero_node);
    6069         1729 :           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
    6070         1729 :           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    6071         1729 :           scalar_results.safe_push (new_temp);
    6072         1729 :         }
    6073              :       else
    6074              :         {
    6075              :           /* Case 3: Create:
    6076              :              s = extract_field <v_out2, 0>
    6077              :              for (offset = element_size;
    6078              :                   offset < vector_size;
    6079              :                   offset += element_size)
    6080              :                {
    6081              :                  Create:  s' = extract_field <v_out2, offset>
    6082              :                  Create:  s = op <s, s'>  // For non SLP cases
    6083              :                }  */
    6084              : 
    6085          240 :           if (dump_enabled_p ())
    6086          151 :             dump_printf_loc (MSG_NOTE, vect_location,
    6087              :                              "Reduce using scalar code.\n");
    6088              : 
    6089          240 :           tree compute_type = TREE_TYPE (vectype1);
    6090          240 :           unsigned element_bitsize = vector_element_bits (vectype1);
    6091          240 :           unsigned vec_size_in_bits = element_bitsize
    6092          240 :             * TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
    6093          240 :           tree bitsize = bitsize_int (element_bitsize);
    6094          240 :           gimple_seq stmts = NULL;
    6095          633 :           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
    6096              :             {
    6097          393 :               unsigned bit_offset;
    6098          786 :               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
    6099          393 :                                        vec_temp, bitsize, bitsize_zero_node);
    6100              : 
    6101              :               /* In SLP we don't need to apply reduction operation, so we just
    6102              :                  collect s' values in SCALAR_RESULTS.  */
    6103          393 :               if (slp_reduc)
    6104          383 :                 scalar_results.safe_push (new_temp);
    6105              : 
    6106          955 :               for (bit_offset = element_bitsize;
    6107         1348 :                    bit_offset < vec_size_in_bits;
    6108          955 :                    bit_offset += element_bitsize)
    6109              :                 {
    6110          955 :                   tree bitpos = bitsize_int (bit_offset);
    6111          955 :                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
    6112              :                                            compute_type, vec_temp,
    6113              :                                            bitsize, bitpos);
    6114          955 :                   if (slp_reduc)
    6115              :                     {
    6116              :                       /* In SLP we don't need to apply reduction operation, so
    6117              :                          we just collect s' values in SCALAR_RESULTS.  */
    6118          945 :                       new_temp = new_name;
    6119          945 :                       scalar_results.safe_push (new_name);
    6120              :                     }
    6121              :                   else
    6122           10 :                     new_temp = gimple_build (&stmts, code, compute_type,
    6123              :                                              new_name, new_temp);
    6124              :                 }
    6125              :             }
    6126              : 
    6127              :           /* The only case where we need to reduce scalar results in an SLP
    6128              :              reduction is unrolling.  If the size of SCALAR_RESULTS is
    6129              :              greater than GROUP_SIZE, we reduce them combining elements modulo
    6130              :              GROUP_SIZE.  */
    6131          240 :           if (slp_reduc)
    6132              :             {
    6133          230 :               tree res, first_res, new_res;
    6134              : 
    6135              :               /* Reduce multiple scalar results in case of SLP unrolling.  */
    6136          878 :               for (j = group_size; scalar_results.iterate (j, &res);
    6137              :                    j++)
    6138              :                 {
    6139          648 :                   first_res = scalar_results[j % group_size];
    6140          648 :                   new_res = gimple_build (&stmts, code, compute_type,
    6141              :                                           first_res, res);
    6142          648 :                   scalar_results[j % group_size] = new_res;
    6143              :                 }
    6144          230 :               scalar_results.truncate (group_size);
    6145         1140 :               for (k = 0; k < group_size; k++)
    6146         1360 :                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
    6147          680 :                                                     scalar_results[k]);
    6148              :             }
    6149              :           else
    6150              :             {
    6151              :               /* Reduction chain - we have one scalar to keep in
    6152              :                  SCALAR_RESULTS.  */
    6153           10 :               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
    6154           10 :               scalar_results.safe_push (new_temp);
    6155              :             }
    6156              : 
    6157          240 :           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    6158              :         }
    6159              : 
    6160         1969 :       if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
    6161            0 :           && induc_val)
    6162              :         {
    6163              :           /* Earlier we set the initial value to be a vector of induc_val
    6164              :              values.  Check the result and if it is induc_val then replace
    6165              :              with the original initial value, unless induc_val is
    6166              :              the same as initial_def already.  */
    6167            0 :           tree zcompare = make_ssa_name (boolean_type_node);
    6168            0 :           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
    6169            0 :                                              scalar_results[0], induc_val);
    6170            0 :           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    6171            0 :           tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
    6172            0 :           tree tmp = make_ssa_name (new_scalar_dest);
    6173            0 :           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
    6174            0 :                                              initial_def, scalar_results[0]);
    6175            0 :           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    6176            0 :           scalar_results[0] = tmp;
    6177              :         }
    6178              :     }
    6179              : 
    6180              :   /* 2.5 Adjust the final result by the initial value of the reduction
    6181              :          variable. (When such adjustment is not needed, then
    6182              :          'adjustment_def' is zero).  For example, if code is PLUS we create:
    6183              :          new_temp = loop_exit_def + adjustment_def  */
    6184              : 
    6185        22146 :   if (adjustment_def)
    6186              :     {
    6187        15738 :       gcc_assert (!slp_reduc || group_size == 1);
    6188        15738 :       gimple_seq stmts = NULL;
    6189        15738 :       if (double_reduc)
    6190              :         {
    6191            0 :           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
    6192            0 :           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
    6193            0 :           new_temp = gimple_build (&stmts, code, vectype,
    6194            0 :                                    reduc_inputs[0], adjustment_def);
    6195              :         }
    6196              :       else
    6197              :         {
    6198        15738 :           new_temp = scalar_results[0];
    6199        15738 :           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
    6200        15738 :           adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
    6201              :                                            adjustment_def);
    6202        15738 :           new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
    6203              :                                      new_temp);
    6204        15738 :           new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
    6205              :                                    new_temp, adjustment_def);
    6206        15738 :           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
    6207              :         }
    6208              : 
    6209        15738 :       epilog_stmt = gimple_seq_last_stmt (stmts);
    6210        15738 :       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    6211        15738 :       scalar_results[0] = new_temp;
    6212              :     }
    6213              : 
    6214              :   /* Record this operation if it could be reused by the epilogue loop.  */
    6215        22146 :   if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
    6216        22146 :       && reduc_inputs.length () == 1)
    6217        21942 :     loop_vinfo->reusable_accumulators.put (scalar_results[0],
    6218              :                                            { orig_reduc_input, reduc_info });
    6219              : 
    6220              :   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
    6221              :           phis with new adjusted scalar results, i.e., replace use <s_out0>
    6222              :           with use <s_out4>.
    6223              : 
    6224              :      Transform:
    6225              :         loop_exit:
    6226              :           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
    6227              :           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
    6228              :           v_out2 = reduce <v_out1>
    6229              :           s_out3 = extract_field <v_out2, 0>
    6230              :           s_out4 = adjust_result <s_out3>
    6231              :           use <s_out0>
    6232              :           use <s_out0>
    6233              : 
    6234              :      into:
    6235              : 
    6236              :         loop_exit:
    6237              :           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
    6238              :           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
    6239              :           v_out2 = reduce <v_out1>
    6240              :           s_out3 = extract_field <v_out2, 0>
    6241              :           s_out4 = adjust_result <s_out3>
    6242              :           use <s_out4>
    6243              :           use <s_out4> */
    6244              : 
    6245        44292 :   gcc_assert (live_out_stmts.size () == scalar_results.length ());
    6246        22146 :   auto_vec<gimple *> phis;
    6247        44742 :   for (k = 0; k < live_out_stmts.size (); k++)
    6248              :     {
    6249        22596 :       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
    6250        22596 :       tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
    6251              : 
    6252              :       /* Find the loop-closed-use at the loop exit of the original scalar
    6253              :          result.  (The reduction result is expected to have two immediate uses,
    6254              :          one at the latch block, and one at the loop exit).  Note with
    6255              :          early break we can have two exit blocks, so pick the correct PHI.  */
    6256       114467 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    6257        69275 :         if (!is_gimple_debug (USE_STMT (use_p))
    6258        69275 :             && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
    6259              :           {
    6260        22591 :             gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
    6261        22591 :             if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
    6262        22583 :               phis.safe_push (USE_STMT (use_p));
    6263        22596 :           }
    6264              : 
    6265        45179 :       FOR_EACH_VEC_ELT (phis, i, exit_phi)
    6266              :         {
    6267              :           /* Replace the uses:  */
    6268        22583 :           orig_name = PHI_RESULT (exit_phi);
    6269              : 
    6270              :           /* Look for a single use at the target of the skip edge.  */
    6271        22583 :           if (unify_with_main_loop_p)
    6272              :             {
    6273           38 :               use_operand_p use_p;
    6274           38 :               gimple *user;
    6275           38 :               if (!single_imm_use (orig_name, &use_p, &user))
    6276            0 :                 gcc_unreachable ();
    6277           38 :               orig_name = gimple_get_lhs (user);
    6278              :             }
    6279              : 
    6280        22583 :           scalar_result = scalar_results[k];
    6281        83717 :           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    6282              :             {
    6283        38551 :               gphi *use_phi = dyn_cast <gphi *> (use_stmt);
    6284       115697 :               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
    6285              :                 {
    6286        38573 :                   if (use_phi
    6287        38573 :                       && (phi_arg_edge_from_use (use_p)->flags & EDGE_ABNORMAL))
    6288              :                     {
    6289            0 :                       gcc_assert (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_name));
    6290            0 :                       SSA_NAME_OCCURS_IN_ABNORMAL_PHI (scalar_result) = 1;
    6291              :                     }
    6292        38573 :                   SET_USE (use_p, scalar_result);
    6293              :                 }
    6294        38551 :               update_stmt (use_stmt);
    6295        22583 :             }
    6296              :         }
    6297              : 
    6298        22596 :       phis.truncate (0);
    6299              :     }
    6300        22146 : }
    6301              : 
    6302              : /* Return a vector of type VECTYPE that is equal to the vector select
    6303              :    operation "MASK ? VEC : IDENTITY".  Insert the select statements
    6304              :    before GSI.  */
    6305              : 
    6306              : static tree
    6307            9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
    6308              :                      tree vec, tree identity)
    6309              : {
    6310            9 :   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
    6311            9 :   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
    6312              :                                           mask, vec, identity);
    6313            9 :   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
    6314            9 :   return cond;
    6315              : }
    6316              : 
    6317              : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
    6318              :    order, starting with LHS.  Insert the extraction statements before GSI and
    6319              :    associate the new scalar SSA names with variable SCALAR_DEST.
    6320              :    If MASK is nonzero, mask the input and then operate on it unconditionally.
    6321              :    Return the SSA name for the result.  */
    6322              : 
    6323              : static tree
    6324         1161 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
    6325              :                        tree_code code, tree lhs, tree vector_rhs,
    6326              :                        tree mask)
    6327              : {
    6328         1161 :   tree vectype = TREE_TYPE (vector_rhs);
    6329         1161 :   tree scalar_type = TREE_TYPE (vectype);
    6330         1161 :   tree bitsize = TYPE_SIZE (scalar_type);
    6331         1161 :   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
    6332         1161 :   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
    6333              : 
    6334              :   /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
    6335              :      to perform an unconditional element-wise reduction of it.  */
    6336         1161 :   if (mask)
    6337              :     {
    6338           84 :       tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
    6339              :                                                    "masked_vector_rhs");
    6340           84 :       tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
    6341              :                                                   false);
    6342           84 :       tree vector_identity = build_vector_from_val (vectype, neutral_op);
    6343           84 :       gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
    6344              :                                              mask, vector_rhs, vector_identity);
    6345           84 :       gsi_insert_before (gsi, select, GSI_SAME_STMT);
    6346           84 :       vector_rhs = masked_vector_rhs;
    6347              :     }
    6348              : 
    6349         1161 :   for (unsigned HOST_WIDE_INT bit_offset = 0;
    6350         5307 :        bit_offset < vec_size_in_bits;
    6351         4146 :        bit_offset += element_bitsize)
    6352              :     {
    6353         4146 :       tree bitpos = bitsize_int (bit_offset);
    6354         4146 :       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
    6355              :                          bitsize, bitpos);
    6356              : 
    6357         4146 :       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
    6358         4146 :       rhs = make_ssa_name (scalar_dest, stmt);
    6359         4146 :       gimple_assign_set_lhs (stmt, rhs);
    6360         4146 :       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
    6361              :       /* Fold the vector extract, combining it with a previous reversal
    6362              :          as seen in PR90579.  */
    6363         4146 :       auto gsi2 = gsi_for_stmt (stmt);
    6364         4146 :       if (fold_stmt (&gsi2, follow_all_ssa_edges))
    6365          358 :         update_stmt (gsi_stmt  (gsi2));
    6366              : 
    6367         4146 :       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
    6368         4146 :       tree new_name = make_ssa_name (scalar_dest, stmt);
    6369         4146 :       gimple_assign_set_lhs (stmt, new_name);
    6370         4146 :       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
    6371         4146 :       lhs = new_name;
    6372              :     }
    6373         1161 :   return lhs;
    6374              : }
    6375              : 
    6376              : /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
    6377              :    type of the vector input.  */
    6378              : 
    6379              : static internal_fn
    6380         2989 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
    6381              : {
    6382         2989 :   internal_fn mask_reduc_fn;
    6383         2989 :   internal_fn mask_len_reduc_fn;
    6384              : 
    6385         2989 :   switch (reduc_fn)
    6386              :     {
    6387            0 :     case IFN_FOLD_LEFT_PLUS:
    6388            0 :       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
    6389            0 :       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
    6390            0 :       break;
    6391              : 
    6392              :     default:
    6393              :       return IFN_LAST;
    6394              :     }
    6395              : 
    6396            0 :   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
    6397              :                                       OPTIMIZE_FOR_SPEED))
    6398              :     return mask_reduc_fn;
    6399            0 :   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
    6400              :                                       OPTIMIZE_FOR_SPEED))
    6401              :     return mask_len_reduc_fn;
    6402              :   return IFN_LAST;
    6403              : }
    6404              : 
    6405              : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
    6406              :    statement that sets the live-out value and SLP_NODE is its SLP node.
    6407              :    CODE is the operation performed by STMT_INFO and NUM_OPS is the number
    6408              :    of its scalar operands, which must be 2 or 4.  REDUC_INDEX is the index
    6409              :    of the child of SLP_NODE defined by the reduction phi.  REDUC_FN is the
    6410              :    function that implements in-order reduction, or IFN_LAST if we should
    6411              :    open-code it.  VECTYPE_IN is the type of the vector input.  MASKS and LENS
    6412              :    are the loop masks and lengths for fully-masked or length-based loops.  */
    6413              : 
    6414              : static bool
    6415          895 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
    6416              :                                stmt_vec_info stmt_info,
    6417              :                                gimple_stmt_iterator *gsi,
    6418              :                                slp_tree slp_node,
    6419              :                                code_helper code, internal_fn reduc_fn,
    6420              :                                int num_ops, tree vectype_in,
    6421              :                                int reduc_index, vec_loop_masks *masks,
    6422              :                                vec_loop_lens *lens)
    6423              : {
    6424          895 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    6425          895 :   tree vectype_out = SLP_TREE_VECTYPE (slp_node);
    6426          895 :   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
    6427              : 
    6428          895 :   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
    6429              : 
    6430          895 :   bool is_cond_op = false;
    6431          895 :   if (!code.is_tree_code ())
    6432              :     {
    6433           30 :       code = conditional_internal_fn_code (internal_fn (code));
    6434           30 :       gcc_assert (code != ERROR_MARK);
    6435              :       is_cond_op = true;
    6436              :     }
    6437              : 
    6438          895 :   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
    6439              : 
    6440          895 :   gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
    6441              :                         TYPE_VECTOR_SUBPARTS (vectype_in)));
    6442              : 
    6443              :   /* ???  We should, when transforming the cycle PHI, record the existing
    6444              :      scalar def as vector def so looking up the vector def works.  This
    6445              :      would also allow generalizing this for reduction paths of length > 1
    6446              :      and/or SLP reductions.  */
    6447          895 :   slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
    6448          895 :   stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
    6449          895 :   tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
    6450              : 
    6451              :   /* The operands either come from a binary operation or an IFN_COND operation.
    6452              :      The former is a gimple assign with binary rhs and the latter is a
    6453              :      gimple call with four arguments.  */
    6454          895 :   gcc_assert (num_ops == 2 || num_ops == 4);
    6455              : 
    6456          895 :   auto_vec<tree> vec_oprnds0, vec_opmask;
    6457          895 :   vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
    6458          895 :                                                   + (1 - reduc_index)],
    6459              :                                                   &vec_oprnds0);
    6460              :   /* For an IFN_COND_OP we also need the vector mask operand.  */
    6461          895 :   if (is_cond_op)
    6462           30 :     vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
    6463              : 
    6464              :   /* The transform below relies on preserving the original scalar PHI
    6465              :      and its latch def which we replace.  So work backwards from there.  */
    6466          895 :   tree scalar_dest
    6467          895 :     = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
    6468              :                                                      (reduc_var_def)),
    6469          895 :                                     loop_latch_edge (loop));
    6470          895 :   stmt_vec_info scalar_dest_def_info
    6471          895 :     = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
    6472          895 :   tree scalar_type = TREE_TYPE (scalar_dest);
    6473              : 
    6474          895 :   int vec_num = vec_oprnds0.length ();
    6475          895 :   tree vec_elem_type = TREE_TYPE (vectype_out);
    6476          895 :   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
    6477              : 
    6478          895 :   tree vector_identity = NULL_TREE;
    6479          895 :   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    6480              :     {
    6481            2 :       vector_identity = build_zero_cst (vectype_out);
    6482            2 :       if (!HONOR_SIGNED_ZEROS (vectype_out))
    6483              :         ;
    6484              :       else
    6485              :         {
    6486            2 :           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
    6487            2 :           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
    6488              :                                         vector_identity);
    6489              :         }
    6490              :     }
    6491              : 
    6492          895 :   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
    6493          895 :   int i;
    6494          895 :   tree def0;
    6495         2056 :   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    6496              :     {
    6497         1161 :       gimple *new_stmt;
    6498         1161 :       tree mask = NULL_TREE;
    6499         1161 :       tree len = NULL_TREE;
    6500         1161 :       tree bias = NULL_TREE;
    6501         1161 :       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    6502              :         {
    6503            9 :           tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
    6504              :                                                vec_num, vectype_in, i);
    6505            9 :           if (is_cond_op)
    6506            9 :             mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
    6507            9 :                                      loop_mask, vec_opmask[i], gsi);
    6508              :           else
    6509              :             mask = loop_mask;
    6510              :         }
    6511         1152 :       else if (is_cond_op)
    6512           75 :         mask = vec_opmask[i];
    6513         1161 :       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    6514              :         {
    6515            0 :           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
    6516              :                                    i, 1, false);
    6517            0 :           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
    6518            0 :           bias = build_int_cst (intQI_type_node, biasval);
    6519            0 :           if (!is_cond_op)
    6520            0 :             mask = build_minus_one_cst (truth_type_for (vectype_in));
    6521              :         }
    6522              : 
    6523              :       /* Handle MINUS by adding the negative.  */
    6524         1161 :       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
    6525              :         {
    6526            0 :           tree negated = make_ssa_name (vectype_out);
    6527            0 :           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
    6528            0 :           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
    6529            0 :           def0 = negated;
    6530              :         }
    6531              : 
    6532            9 :       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
    6533         1170 :           && mask && mask_reduc_fn == IFN_LAST)
    6534            9 :         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
    6535              :                                     vector_identity);
    6536              : 
    6537              :       /* On the first iteration the input is simply the scalar phi
    6538              :          result, and for subsequent iterations it is the output of
    6539              :          the preceding operation.  */
    6540         1161 :       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
    6541              :         {
    6542            0 :           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
    6543            0 :             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
    6544              :                                                    def0, mask, len, bias);
    6545            0 :           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
    6546            0 :             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
    6547              :                                                    def0, mask);
    6548              :           else
    6549            0 :             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
    6550              :                                                    def0);
    6551              :           /* For chained SLP reductions the output of the previous reduction
    6552              :              operation serves as the input of the next. For the final statement
    6553              :              the output cannot be a temporary - we reuse the original
    6554              :              scalar destination of the last statement.  */
    6555            0 :           if (i != vec_num - 1)
    6556              :             {
    6557            0 :               gimple_set_lhs (new_stmt, scalar_dest_var);
    6558            0 :               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
    6559            0 :               gimple_set_lhs (new_stmt, reduc_var);
    6560              :             }
    6561              :         }
    6562              :       else
    6563              :         {
    6564         1161 :           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
    6565              :                                              tree_code (code), reduc_var, def0,
    6566              :                                              mask);
    6567         1161 :           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
    6568              :           /* Remove the statement, so that we can use the same code paths
    6569              :              as for statements that we've just created.  */
    6570         1161 :           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
    6571         1161 :           gsi_remove (&tmp_gsi, true);
    6572              :         }
    6573              : 
    6574         1161 :       if (i == vec_num - 1)
    6575              :         {
    6576          895 :           gimple_set_lhs (new_stmt, scalar_dest);
    6577          895 :           vect_finish_replace_stmt (loop_vinfo,
    6578              :                                     scalar_dest_def_info,
    6579              :                                     new_stmt);
    6580              :         }
    6581              :       else
    6582          266 :         vect_finish_stmt_generation (loop_vinfo,
    6583              :                                      scalar_dest_def_info,
    6584              :                                      new_stmt, gsi);
    6585              : 
    6586         1161 :       slp_node->push_vec_def (new_stmt);
    6587              :     }
    6588              : 
    6589          895 :   return true;
    6590          895 : }
    6591              : 
    6592              : /* Function is_nonwrapping_integer_induction.
    6593              : 
    6594              :    Check if STMT_VINFO (which is part of loop LOOP) both increments and
    6595              :    does not cause overflow.  */
    6596              : 
    6597              : static bool
    6598          408 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
    6599              : {
    6600          408 :   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
    6601          408 :   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
    6602          408 :   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
    6603          408 :   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
    6604          408 :   widest_int ni, max_loop_value, lhs_max;
    6605          408 :   wi::overflow_type overflow = wi::OVF_NONE;
    6606              : 
    6607              :   /* Make sure the loop is integer based.  */
    6608          408 :   if (TREE_CODE (base) != INTEGER_CST
    6609          109 :       || TREE_CODE (step) != INTEGER_CST)
    6610              :     return false;
    6611              : 
    6612              :   /* Check that the max size of the loop will not wrap.  */
    6613              : 
    6614          109 :   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
    6615              :     return true;
    6616              : 
    6617            8 :   if (! max_stmt_executions (loop, &ni))
    6618              :     return false;
    6619              : 
    6620            8 :   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
    6621            8 :                             &overflow);
    6622            8 :   if (overflow)
    6623              :     return false;
    6624              : 
    6625            8 :   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
    6626           16 :                             TYPE_SIGN (lhs_type), &overflow);
    6627            8 :   if (overflow)
    6628              :     return false;
    6629              : 
    6630            8 :   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
    6631            8 :           <= TYPE_PRECISION (lhs_type));
    6632          408 : }
    6633              : 
    6634              : /* Check if masking can be supported by inserting a conditional expression.
    6635              :    CODE is the code for the operation.  COND_FN is the conditional internal
    6636              :    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
    6637              : static bool
    6638         5962 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
    6639              :                          tree vectype_in)
    6640              : {
    6641         5962 :   if (cond_fn != IFN_LAST
    6642         5962 :       && direct_internal_fn_supported_p (cond_fn, vectype_in,
    6643              :                                          OPTIMIZE_FOR_SPEED))
    6644              :     return false;
    6645              : 
    6646         4221 :   if (code.is_tree_code ())
    6647         4207 :     switch (tree_code (code))
    6648              :       {
    6649              :       case DOT_PROD_EXPR:
    6650              :       case SAD_EXPR:
    6651              :         return true;
    6652              : 
    6653              :       default:
    6654              :         break;
    6655              :       }
    6656              :   return false;
    6657              : }
    6658              : 
    6659              : /* Insert a conditional expression to enable masked vectorization.  CODE is the
    6660              :    code for the operation.  VOP is the array of operands.  MASK is the loop
    6661              :    mask.  GSI is a statement iterator used to place the new conditional
    6662              :    expression.  */
    6663              : static void
    6664            4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
    6665              :                       gimple_stmt_iterator *gsi)
    6666              : {
    6667            4 :   switch (tree_code (code))
    6668              :     {
    6669            4 :     case DOT_PROD_EXPR:
    6670            4 :       {
    6671            4 :         tree vectype = TREE_TYPE (vop[1]);
    6672            4 :         tree zero = build_zero_cst (vectype);
    6673            4 :         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
    6674            4 :         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
    6675              :                                                mask, vop[1], zero);
    6676            4 :         gsi_insert_before (gsi, select, GSI_SAME_STMT);
    6677            4 :         vop[1] = masked_op1;
    6678            4 :         break;
    6679              :       }
    6680              : 
    6681            0 :     case SAD_EXPR:
    6682            0 :       {
    6683            0 :         tree vectype = TREE_TYPE (vop[1]);
    6684            0 :         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
    6685            0 :         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
    6686              :                                                mask, vop[1], vop[0]);
    6687            0 :         gsi_insert_before (gsi, select, GSI_SAME_STMT);
    6688            0 :         vop[1] = masked_op1;
    6689            0 :         break;
    6690              :       }
    6691              : 
    6692            0 :     default:
    6693            0 :       gcc_unreachable ();
    6694              :     }
    6695            4 : }
    6696              : 
    6697              : /* Given an operation with CODE in loop reduction path whose reduction PHI is
    6698              :    specified by REDUC_INFO, the operation has TYPE of scalar result, and its
    6699              :    input vectype is represented by VECTYPE_IN. The vectype of vectorized result
    6700              :    may be different from VECTYPE_IN, either in base type or vectype lanes,
    6701              :    as is the case for lane-reducing operations.  This function checks whether
    6702              :    it is possible, and how, to perform partial vectorization on the operation
    6703              :    in the context of LOOP_VINFO.  */
    6704              : 
    6705              : static void
    6706         4093 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
    6707              :                                             vect_reduc_info reduc_info,
    6708              :                                             slp_tree slp_node,
    6709              :                                             code_helper code, tree type,
    6710              :                                             tree vectype_in)
    6711              : {
    6712         4093 :   enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
    6713         4093 :   internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
    6714         4093 :   internal_fn cond_fn
    6715         1157 :     = ((code.is_internal_fn ()
    6716         1157 :         && internal_fn_mask_index ((internal_fn)code) != -1)
    6717         4093 :        ? (internal_fn)code : get_conditional_internal_fn (code, type));
    6718              : 
    6719         4093 :   if (reduc_type != FOLD_LEFT_REDUCTION
    6720         3326 :       && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
    6721         7306 :       && (cond_fn == IFN_LAST
    6722         3213 :           || !direct_internal_fn_supported_p (cond_fn, vectype_in,
    6723              :                                               OPTIMIZE_FOR_SPEED)))
    6724              :     {
    6725         1999 :       if (dump_enabled_p ())
    6726           98 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    6727              :                          "can't operate on partial vectors because"
    6728              :                          " no conditional operation is available.\n");
    6729         1999 :       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    6730              :     }
    6731         2094 :   else if (reduc_type == FOLD_LEFT_REDUCTION
    6732         2094 :            && reduc_fn == IFN_LAST
    6733         2094 :            && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
    6734              :     {
    6735            0 :       if (dump_enabled_p ())
    6736            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    6737              :                         "can't operate on partial vectors because"
    6738              :                         " no conditional operation is available.\n");
    6739            0 :       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    6740              :     }
    6741         2094 :   else if (reduc_type == FOLD_LEFT_REDUCTION
    6742          767 :            && internal_fn_mask_index (reduc_fn) == -1
    6743          767 :            && FLOAT_TYPE_P (vectype_in)
    6744         2861 :            && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
    6745              :     {
    6746            0 :       if (dump_enabled_p ())
    6747            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    6748              :                          "can't operate on partial vectors because"
    6749              :                          " signed zeros cannot be preserved.\n");
    6750            0 :       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    6751              :     }
    6752              :   else
    6753              :     {
    6754         2094 :       internal_fn mask_reduc_fn
    6755         2094 :                         = get_masked_reduction_fn (reduc_fn, vectype_in);
    6756         2094 :       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
    6757         2094 :       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
    6758         2094 :       unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
    6759              : 
    6760         2094 :       if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
    6761            0 :         vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
    6762              :       else
    6763         2094 :         vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
    6764              :     }
    6765         4093 : }
    6766              : 
    6767              : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
    6768              :    the context of LOOP_VINFO.  The vector cost is recorded in COST_VEC.
    6769              :    SLP_NODE is the SLP node for the operation and must not be NULL.
    6770              : 
    6771              :    For a lane-reducing operation, the loop reduction path that it lies in
    6772              :    may contain normal operations, or other lane-reducing operations of
    6773              :    different input type size, for example:
    6774              : 
    6775              :      int sum = 0;
    6776              :      for (i)
    6777              :        {
    6778              :          ...
    6779              :          sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
    6780              :          sum += w[i];                // widen-sum <vector(16) char>
    6781              :          sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
    6782              :          sum += n[i];                // normal <vector(4) int>
    6783              :          ...
    6784              :        }
    6785              : 
    6786              :    The vectorization factor is essentially determined by the operation whose
    6787              :    input vectype has the most lanes ("vector(16) char" in the example), while
    6788              :    we need to choose the input vectype with the fewest lanes ("vector(4) int"
    6789              :    in the example) to determine the effective number of vector reduction PHIs.  */
    6790              : 
    6791              : bool
    6792       382206 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
    6793              :                             slp_tree slp_node, stmt_vector_for_cost *cost_vec)
    6794              : {
    6795       382206 :   gimple *stmt = stmt_info->stmt;
    6796              : 
    6797       382206 :   if (!lane_reducing_stmt_p (stmt))
    6798              :     return false;
    6799              : 
    6800          716 :   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
    6801              : 
    6802          716 :   if (!INTEGRAL_TYPE_P (type))
    6803              :     return false;
    6804              : 
    6805              :   /* Do not try to vectorize bit-precision reductions.  */
    6806          716 :   if (!type_has_mode_precision_p (type))
    6807              :     return false;
    6808              : 
    6809          716 :   vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
    6810              : 
    6811              :   /* TODO: Support lane-reducing operation that does not directly participate
    6812              :      in loop reduction.  */
    6813          716 :   if (!reduc_info)
    6814              :     return false;
    6815              : 
    6816              :   /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
    6817              :      recognized.  */
    6818          716 :   gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
    6819          716 :   gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
    6820              : 
    6821         2864 :   for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
    6822              :     {
    6823         2148 :       slp_tree slp_op;
    6824         2148 :       tree op;
    6825         2148 :       tree vectype;
    6826         2148 :       enum vect_def_type dt;
    6827              : 
    6828         2148 :       if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
    6829              :                                &slp_op, &dt, &vectype))
    6830              :         {
    6831            0 :           if (dump_enabled_p ())
    6832            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    6833              :                              "use not simple.\n");
    6834            0 :           return false;
    6835              :         }
    6836              : 
    6837         2148 :       if (!vectype)
    6838              :         {
    6839            6 :           vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
    6840              :                                                  slp_op);
    6841            6 :           if (!vectype)
    6842              :             return false;
    6843              :         }
    6844              : 
    6845         2148 :       if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
    6846              :         {
    6847            0 :           if (dump_enabled_p ())
    6848            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    6849              :                              "incompatible vector types for invariants\n");
    6850            0 :           return false;
    6851              :         }
    6852              : 
    6853         2148 :       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
    6854          716 :         continue;
    6855              : 
    6856              :       /* There should be at most one cycle def in the stmt.  */
    6857         1432 :       if (VECTORIZABLE_CYCLE_DEF (dt))
    6858              :         return false;
    6859              :     }
    6860              : 
    6861          716 :   slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
    6862          716 :   tree vectype_in = SLP_TREE_VECTYPE (node_in);
    6863          716 :   gcc_assert (vectype_in);
    6864              : 
    6865              :   /* Compute number of effective vector statements for costing.  */
    6866          716 :   unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
    6867          716 :   gcc_assert (ncopies_for_cost >= 1);
    6868              : 
    6869          716 :   if (vect_is_emulated_mixed_dot_prod (slp_node))
    6870              :     {
    6871              :       /* We need extra two invariants: one that contains the minimum signed
    6872              :          value and one that contains half of its negative.  */
    6873           15 :       int prologue_stmts = 2;
    6874           15 :       unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
    6875              :                                         scalar_to_vec, slp_node, 0,
    6876              :                                         vect_prologue);
    6877           15 :       if (dump_enabled_p ())
    6878            0 :         dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
    6879              :                      "extra prologue_cost = %d .\n", cost);
    6880              : 
    6881              :       /* Three dot-products and a subtraction.  */
    6882           15 :       ncopies_for_cost *= 4;
    6883              :     }
    6884              : 
    6885          716 :   record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
    6886              :                     0, vect_body);
    6887              : 
    6888          716 :   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    6889              :     {
    6890          113 :       enum tree_code code = gimple_assign_rhs_code (stmt);
    6891          113 :       vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
    6892          113 :                                                   node_in, code, type,
    6893              :                                                   vectype_in);
    6894              :     }
    6895              : 
    6896              :   /* Transform via vect_transform_reduction.  */
    6897          716 :   SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
    6898          716 :   return true;
    6899              : }
    6900              : 
    6901              : /* Function vectorizable_reduction.
    6902              : 
    6903              :    Check if STMT_INFO performs a reduction operation that can be vectorized.
    6904              :    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
    6905              :    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
    6906              :    Return true if STMT_INFO is vectorizable in this way.
    6907              : 
    6908              :    This function also handles reduction idioms (patterns) that have been
    6909              :    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
    6910              :    may be of this form:
    6911              :      X = pattern_expr (arg0, arg1, ..., X)
    6912              :    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
    6913              :    sequence that had been detected and replaced by the pattern-stmt
    6914              :    (STMT_INFO).
    6915              : 
    6916              :    This function also handles reduction of condition expressions, for example:
    6917              :      for (int i = 0; i < N; i++)
    6918              :        if (a[i] < value)
    6919              :          last = a[i];
    6920              :    This is handled by vectorising the loop and creating an additional vector
    6921              :    containing the loop indexes for which "a[i] < value" was true.  In the
    6922              :    function epilogue this is reduced to a single max value and then used to
    6923              :    index into the vector of results.
    6924              : 
    6925              :    In some cases of reduction patterns, the type of the reduction variable X is
    6926              :    different than the type of the other arguments of STMT_INFO.
    6927              :    In such cases, the vectype that is used when transforming STMT_INFO into
    6928              :    a vector stmt is different than the vectype that is used to determine the
    6929              :    vectorization factor, because it consists of a different number of elements
    6930              :    than the actual number of elements that are being operated upon in parallel.
    6931              : 
    6932              :    For example, consider an accumulation of shorts into an int accumulator.
    6933              :    On some targets it's possible to vectorize this pattern operating on 8
    6934              :    shorts at a time (hence, the vectype for purposes of determining the
    6935              :    vectorization factor should be V8HI); on the other hand, the vectype that
    6936              :    is used to create the vector form is actually V4SI (the type of the result).
    6937              : 
    6938              :    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
    6939              :    indicates what is the actual level of parallelism (V8HI in the example), so
    6940              :    that the right vectorization factor would be derived.  This vectype
    6941              :    corresponds to the type of arguments to the reduction stmt, and should *NOT*
    6942              :    be used to create the vectorized stmt.  The right vectype for the vectorized
    6943              :    stmt is obtained from the type of the result X:
    6944              :       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
    6945              : 
    6946              :    This means that, contrary to "regular" reductions (or "regular" stmts in
    6947              :    general), the following equation:
    6948              :       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
    6949              :    does *NOT* necessarily hold for reduction patterns.  */
    6950              : 
    6951              : bool
    6952       381490 : vectorizable_reduction (loop_vec_info loop_vinfo,
    6953              :                         stmt_vec_info stmt_info, slp_tree slp_node,
    6954              :                         slp_instance slp_node_instance,
    6955              :                         stmt_vector_for_cost *cost_vec)
    6956              : {
    6957       381490 :   tree vectype_in = NULL_TREE;
    6958       381490 :   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
    6959       381490 :   stmt_vec_info cond_stmt_vinfo = NULL;
    6960       381490 :   int i;
    6961       381490 :   int ncopies;
    6962       381490 :   bool single_defuse_cycle = false;
    6963       381490 :   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
    6964       381490 :   tree cond_reduc_val = NULL_TREE;
    6965              : 
    6966              :   /* Make sure it was already recognized as a reduction computation.  */
    6967       381490 :   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
    6968              :       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
    6969       381490 :       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
    6970              :     return false;
    6971              : 
    6972              :   /* The reduction meta.  */
    6973        84485 :   vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
    6974              : 
    6975        84485 :   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    6976              :     {
    6977         1490 :       gcc_assert (is_a <gphi *> (stmt_info->stmt));
    6978              :       /* We eventually need to set a vector type on invariant arguments.  */
    6979              :       unsigned j;
    6980              :       slp_tree child;
    6981         4462 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
    6982         2980 :         if (!vect_maybe_update_slp_op_vectype (child,
    6983              :                                                SLP_TREE_VECTYPE (slp_node)))
    6984              :           {
    6985            0 :             if (dump_enabled_p ())
    6986            0 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    6987              :                                "incompatible vector types for "
    6988              :                                "invariants\n");
    6989            0 :             return false;
    6990              :           }
    6991         2980 :         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
    6992         2980 :                  && !useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
    6993              :                                                 SLP_TREE_VECTYPE (child)))
    6994              :           {
    6995              :             /* With bools we can have mask and non-mask precision vectors
    6996              :                or different non-mask precisions.  While pattern recog is
    6997              :                supposed to guarantee consistency here, we do not have
    6998              :                pattern stmts for PHIs (PR123316).
    6999              :                Deal with that here instead of ICEing later.  */
    7000            8 :             if (dump_enabled_p ())
    7001            8 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7002              :                                "incompatible vector type setup from "
    7003              :                                "bool pattern detection\n");
    7004            8 :             return false;
    7005              :           }
    7006              :       /* Analysis for double-reduction is done on the outer
    7007              :          loop PHI, nested cycles have no further restrictions.  */
    7008         1482 :       SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
    7009         1482 :       return true;
    7010              :     }
    7011              : 
    7012        82995 :   if (!is_a <gphi *> (stmt_info->stmt))
    7013              :     {
    7014         7924 :       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
    7015         7924 :       SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
    7016         7924 :       return true;
    7017              :     }
    7018              : 
    7019        75071 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    7020        75071 :   stmt_vec_info phi_info = stmt_info;
    7021        75071 :   bool double_reduc = false;
    7022        75071 :   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    7023              :     {
    7024              :       /* We arrive here for both the inner loop LC PHI and the
    7025              :          outer loop PHI.  The latter is what we want to analyze the
    7026              :          reduction with.  The LC PHI is handled by vectorizable_lc_phi.  */
    7027          322 :       if (gimple_bb (stmt_info->stmt) != loop->header)
    7028            0 :         return false;
    7029              : 
    7030              :       /* Set loop and phi_info to the inner loop.  */
    7031          322 :       use_operand_p use_p;
    7032          322 :       gimple *use_stmt;
    7033          322 :       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
    7034              :                                  &use_p, &use_stmt);
    7035          322 :       gcc_assert (res);
    7036          322 :       phi_info = loop_vinfo->lookup_stmt (use_stmt);
    7037          322 :       loop = loop->inner;
    7038          322 :       double_reduc = true;
    7039              :     }
    7040              : 
    7041        75071 :   const bool reduc_chain = reduc_info->is_reduc_chain;
    7042        75071 :   slp_node_instance->reduc_phis = slp_node;
    7043              :   /* ???  We're leaving slp_node to point to the PHIs, we only
    7044              :      need it to get at the number of vector stmts which wasn't
    7045              :      yet initialized for the instance root.  */
    7046              : 
    7047              :   /* PHIs should not participate in patterns.  */
    7048        75071 :   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
    7049        75071 :   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
    7050              : 
    7051              :   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
    7052              :      and compute the reduction chain length.  Discover the real
    7053              :      reduction operation stmt on the way (slp_for_stmt_info).  */
    7054        75071 :   unsigned reduc_chain_length = 0;
    7055        75071 :   stmt_info = NULL;
    7056        75071 :   slp_tree slp_for_stmt_info = NULL;
    7057        75071 :   slp_tree vdef_slp = slp_node_instance->root;
    7058       165701 :   while (vdef_slp != slp_node)
    7059              :     {
    7060        91722 :       int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
    7061        91722 :       if (reduc_idx == -1)
    7062              :         {
    7063         1084 :           if (dump_enabled_p ())
    7064            7 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7065              :                              "reduction chain broken by patterns.\n");
    7066         1092 :           return false;
    7067              :         }
    7068        90638 :       stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
    7069        90638 :       if (is_a <gphi *> (vdef->stmt))
    7070              :         {
    7071          644 :           vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
    7072              :           /* Do not count PHIs towards the chain length.  */
    7073          644 :           continue;
    7074              :         }
    7075        89994 :       gimple_match_op op;
    7076        89994 :       if (!gimple_extract_op (vdef->stmt, &op))
    7077              :         {
    7078            0 :           if (dump_enabled_p ())
    7079            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7080              :                              "reduction chain includes unsupported"
    7081              :                              " statement type.\n");
    7082            0 :           return false;
    7083              :         }
    7084        89994 :       if (CONVERT_EXPR_CODE_P (op.code))
    7085              :         {
    7086         5238 :           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
    7087              :             {
    7088            8 :               if (dump_enabled_p ())
    7089            8 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7090              :                                  "conversion in the reduction chain.\n");
    7091            8 :               return false;
    7092              :             }
    7093         5230 :           vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
    7094              :         }
    7095              :       else
    7096              :         {
    7097              :           /* First non-conversion stmt.  */
    7098        84756 :           if (!slp_for_stmt_info)
    7099        73979 :             slp_for_stmt_info = vdef_slp;
    7100              : 
    7101        84756 :           if (lane_reducing_op_p (op.code))
    7102              :             {
    7103              :               /* The last operand of lane-reducing operation is for
    7104              :                  reduction.  */
    7105          716 :               gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
    7106              : 
    7107          716 :               slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
    7108          716 :               tree vectype_op = SLP_TREE_VECTYPE (op_node);
    7109          716 :               tree type_op = TREE_TYPE (op.ops[0]);
    7110          716 :               if (!vectype_op)
    7111              :                 {
    7112            9 :                   vectype_op = get_vectype_for_scalar_type (loop_vinfo,
    7113              :                                                             type_op);
    7114            9 :                   if (!vectype_op
    7115            9 :                       || !vect_maybe_update_slp_op_vectype (op_node,
    7116              :                                                             vectype_op))
    7117            0 :                     return false;
    7118              :                 }
    7119              : 
    7120              :               /* To accommodate lane-reducing operations of mixed input
    7121              :                  vectypes, choose input vectype with the least lanes for the
    7122              :                  reduction PHI statement, which would result in the most
    7123              :                  ncopies for vectorized reduction results.  */
    7124          716 :               if (!vectype_in
    7125          716 :                   || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
    7126           58 :                        < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
    7127          687 :                 vectype_in = vectype_op;
    7128              :             }
    7129        84040 :           else if (!vectype_in)
    7130        73292 :             vectype_in = SLP_TREE_VECTYPE (slp_node);
    7131        84756 :           vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
    7132              :         }
    7133        89986 :       reduc_chain_length++;
    7134              :     }
    7135        73979 :   if (!slp_for_stmt_info)
    7136              :     {
    7137            0 :       if (dump_enabled_p ())
    7138            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7139              :                          "only noop-conversions in the reduction chain.\n");
    7140            0 :       return false;
    7141              :     }
    7142        73979 :   stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
    7143              : 
    7144              :   /* PHIs should not participate in patterns.  */
    7145        73979 :   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
    7146              : 
    7147              :   /* 1. Is vectorizable reduction?  */
    7148              :   /* Not supportable if the reduction variable is used in the loop, unless
    7149              :      it's a reduction chain.  */
    7150        73979 :   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
    7151            0 :       && !reduc_chain)
    7152              :     return false;
    7153              : 
    7154              :   /* Reductions that are not used even in an enclosing outer-loop,
    7155              :      are expected to be "live" (used out of the loop).  */
    7156        73979 :   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
    7157            0 :       && !STMT_VINFO_LIVE_P (stmt_info))
    7158              :     return false;
    7159              : 
    7160              :   /* 2. Has this been recognized as a reduction pattern?
    7161              : 
    7162              :      Check if STMT represents a pattern that has been recognized
    7163              :      in earlier analysis stages.  For stmts that represent a pattern,
    7164              :      the STMT_VINFO_RELATED_STMT field records the last stmt in
    7165              :      the original sequence that constitutes the pattern.  */
    7166              : 
    7167        73979 :   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
    7168        73979 :   if (orig_stmt_info)
    7169              :     {
    7170         5106 :       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
    7171         5106 :       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    7172              :     }
    7173              : 
    7174              :   /* 3. Check the operands of the operation.  The first operands are defined
    7175              :         inside the loop body. The last operand is the reduction variable,
    7176              :         which is defined by the loop-header-phi.  */
    7177              : 
    7178        73979 :   tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
    7179        73979 :   VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
    7180              : 
    7181        73979 :   gimple_match_op op;
    7182        73979 :   if (!gimple_extract_op (stmt_info->stmt, &op))
    7183            0 :     gcc_unreachable ();
    7184        73979 :   bool lane_reducing = lane_reducing_op_p (op.code);
    7185              : 
    7186        73979 :   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
    7187        22131 :       && !SCALAR_FLOAT_TYPE_P (op.type))
    7188              :     return false;
    7189              : 
    7190              :   /* Do not try to vectorize bit-precision reductions.  */
    7191        73979 :   if (!type_has_mode_precision_p (op.type)
    7192         1737 :       && op.code != BIT_AND_EXPR
    7193         1602 :       && op.code != BIT_IOR_EXPR
    7194        74455 :       && op.code != BIT_XOR_EXPR)
    7195              :     return false;
    7196              : 
    7197              :   /* Lane-reducing ops also never can be used in a SLP reduction group
    7198              :      since we'll mix lanes belonging to different reductions.  But it's
    7199              :      OK to use them in a reduction chain or when the reduction group
    7200              :      has just one element.  */
    7201        73669 :   if (lane_reducing
    7202        73669 :       && !reduc_chain
    7203          650 :       && SLP_TREE_LANES (slp_node) > 1)
    7204              :     {
    7205            0 :       if (dump_enabled_p ())
    7206            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7207              :                          "lane-reducing reduction in reduction group.\n");
    7208            0 :       return false;
    7209              :     }
    7210              : 
    7211              :   /* All uses but the last are expected to be defined in the loop.
    7212              :      The last use is the reduction variable.  In case of nested cycle this
    7213              :      assumption is not true: we use reduc_index to record the index of the
    7214              :      reduction variable.  */
    7215        73669 :   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
    7216        73669 :   tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
    7217        73669 :   gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
    7218       236091 :   for (i = 0; i < (int) op.num_ops; i++)
    7219              :     {
    7220              :       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
    7221       162422 :       if (i == 0 && op.code == COND_EXPR)
    7222        81292 :         continue;
    7223              : 
    7224       161568 :       stmt_vec_info def_stmt_info;
    7225       161568 :       enum vect_def_type dt;
    7226       161568 :       if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
    7227              :                                i, &op.ops[i], &slp_op[i], &dt,
    7228       161568 :                                &vectype_op[i], &def_stmt_info))
    7229              :         {
    7230            0 :           if (dump_enabled_p ())
    7231            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7232              :                              "use not simple.\n");
    7233            0 :           return false;
    7234              :         }
    7235              : 
    7236              :       /* Skip reduction operands, and for an IFN_COND_OP we might hit the
    7237              :          reduction operand twice (once as definition, once as else).  */
    7238       161568 :       if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
    7239       323136 :           == SLP_TREE_CHILDREN
    7240       161568 :                (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
    7241        80438 :         continue;
    7242              : 
    7243              :       /* There should be only one cycle def in the stmt, the one
    7244              :          leading to reduc_def.  */
    7245        81130 :       if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
    7246              :         return false;
    7247              : 
    7248        81130 :       if (!vectype_op[i])
    7249         7399 :         vectype_op[i]
    7250         7399 :           = get_vectype_for_scalar_type (loop_vinfo,
    7251         7399 :                                          TREE_TYPE (op.ops[i]), slp_op[i]);
    7252              : 
    7253              :       /* Record how the non-reduction-def value of COND_EXPR is defined.
    7254              :          ???  For a chain of multiple CONDs we'd have to match them up all.  */
    7255        81130 :       if (op.code == COND_EXPR && reduc_chain_length == 1)
    7256              :         {
    7257          831 :           if (dt == vect_constant_def)
    7258              :             {
    7259          118 :               cond_reduc_dt = dt;
    7260          118 :               cond_reduc_val = op.ops[i];
    7261              :             }
    7262          713 :           else if (dt == vect_induction_def
    7263          408 :                    && def_stmt_info
    7264         1121 :                    && is_nonwrapping_integer_induction (def_stmt_info, loop))
    7265              :             {
    7266          109 :               cond_reduc_dt = dt;
    7267          109 :               cond_stmt_vinfo = def_stmt_info;
    7268              :             }
    7269              :         }
    7270              :     }
    7271              : 
    7272        73669 :   enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
    7273              :   /* If we have a condition reduction, see if we can simplify it further.  */
    7274        73669 :   if (reduction_type == COND_REDUCTION)
    7275              :     {
    7276          842 :       if (SLP_TREE_LANES (slp_node) != 1)
    7277              :         return false;
    7278              : 
    7279              :       /* When the condition uses the reduction value in the condition, fail.  */
    7280          818 :       if (SLP_TREE_REDUC_IDX (slp_node) == 0)
    7281              :         {
    7282            0 :           if (dump_enabled_p ())
    7283            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7284              :                              "condition depends on previous iteration\n");
    7285            0 :           return false;
    7286              :         }
    7287              : 
    7288          818 :       if (reduc_chain_length == 1
    7289          818 :           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
    7290              :                                               OPTIMIZE_FOR_SPEED)
    7291          795 :               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
    7292              :                                                  vectype_in,
    7293              :                                                  OPTIMIZE_FOR_SPEED)))
    7294              :         {
    7295            0 :           if (dump_enabled_p ())
    7296            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7297              :                              "optimizing condition reduction with"
    7298              :                              " FOLD_EXTRACT_LAST.\n");
    7299            0 :           VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
    7300              :         }
    7301          818 :       else if (cond_reduc_dt == vect_induction_def)
    7302              :         {
    7303          109 :           tree base
    7304              :             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
    7305          109 :           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
    7306              : 
    7307          109 :           gcc_assert (TREE_CODE (base) == INTEGER_CST
    7308              :                       && TREE_CODE (step) == INTEGER_CST);
    7309          109 :           cond_reduc_val = NULL_TREE;
    7310          109 :           enum tree_code cond_reduc_op_code = ERROR_MARK;
    7311          109 :           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
    7312          109 :           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
    7313              :             ;
    7314              :           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
    7315              :              above base; punt if base is the minimum value of the type for
    7316              :              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
    7317           97 :           else if (tree_int_cst_sgn (step) == -1)
    7318              :             {
    7319           18 :               cond_reduc_op_code = MIN_EXPR;
    7320           18 :               if (tree_int_cst_sgn (base) == -1)
    7321            0 :                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
    7322           18 :               else if (tree_int_cst_lt (base,
    7323           18 :                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
    7324           18 :                 cond_reduc_val
    7325           18 :                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
    7326              :             }
    7327              :           else
    7328              :             {
    7329           79 :               cond_reduc_op_code = MAX_EXPR;
    7330           79 :               if (tree_int_cst_sgn (base) == 1)
    7331            0 :                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
    7332           79 :               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
    7333              :                                         base))
    7334           79 :                 cond_reduc_val
    7335           79 :                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
    7336              :             }
    7337           97 :           if (cond_reduc_val)
    7338              :             {
    7339           97 :               if (dump_enabled_p ())
    7340           61 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7341              :                                  "condition expression based on "
    7342              :                                  "integer induction.\n");
    7343           97 :               VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
    7344           97 :               VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
    7345           97 :                 = cond_reduc_val;
    7346           97 :               VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
    7347              :             }
    7348              :         }
    7349          709 :       else if (cond_reduc_dt == vect_constant_def)
    7350              :         {
    7351          108 :           enum vect_def_type cond_initial_dt;
    7352          108 :           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
    7353          108 :           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
    7354          108 :           if (cond_initial_dt == vect_constant_def
    7355          133 :               && types_compatible_p (TREE_TYPE (cond_initial_val),
    7356           25 :                                      TREE_TYPE (cond_reduc_val)))
    7357              :             {
    7358           25 :               tree e = fold_binary (LE_EXPR, boolean_type_node,
    7359              :                                     cond_initial_val, cond_reduc_val);
    7360           25 :               if (e && (integer_onep (e) || integer_zerop (e)))
    7361              :                 {
    7362           25 :                   if (dump_enabled_p ())
    7363           16 :                     dump_printf_loc (MSG_NOTE, vect_location,
    7364              :                                      "condition expression based on "
    7365              :                                      "compile time constant.\n");
    7366              :                   /* Record reduction code at analysis stage.  */
    7367           25 :                   VECT_REDUC_INFO_CODE (reduc_info)
    7368           25 :                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
    7369           25 :                   VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
    7370              :                 }
    7371              :             }
    7372              :         }
    7373              :     }
    7374              : 
    7375        73645 :   if (STMT_VINFO_LIVE_P (phi_info))
    7376              :     return false;
    7377              : 
    7378        73645 :   ncopies = vect_get_num_copies (loop_vinfo, slp_node);
    7379              : 
    7380        73645 :   gcc_assert (ncopies >= 1);
    7381              : 
    7382        73645 :   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
    7383              : 
    7384              :   /* 4.2. Check support for the epilog operation.
    7385              : 
    7386              :           If STMT represents a reduction pattern, then the type of the
    7387              :           reduction variable may be different than the type of the rest
    7388              :           of the arguments.  For example, consider the case of accumulation
    7389              :           of shorts into an int accumulator; The original code:
    7390              :                         S1: int_a = (int) short_a;
    7391              :           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
    7392              : 
    7393              :           was replaced with:
    7394              :                         STMT: int_acc = widen_sum <short_a, int_acc>
    7395              : 
    7396              :           This means that:
    7397              :           1. The tree-code that is used to create the vector operation in the
    7398              :              epilog code (that reduces the partial results) is not the
    7399              :              tree-code of STMT, but is rather the tree-code of the original
    7400              :              stmt from the pattern that STMT is replacing.  I.e, in the example
    7401              :              above we want to use 'widen_sum' in the loop, but 'plus' in the
    7402              :              epilog.
    7403              :           2. The type (mode) we use to check available target support
    7404              :              for the vector operation to be created in the *epilog*, is
    7405              :              determined by the type of the reduction variable (in the example
    7406              :              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
    7407              :              However the type (mode) we use to check available target support
    7408              :              for the vector operation to be created *inside the loop*, is
    7409              :              determined by the type of the other arguments to STMT (in the
    7410              :              example we'd check this: optab_handler (widen_sum_optab,
    7411              :              vect_short_mode)).
    7412              : 
    7413              :           This is contrary to "regular" reductions, in which the types of all
    7414              :           the arguments are the same as the type of the reduction variable.
    7415              :           For "regular" reductions we can therefore use the same vector type
    7416              :           (and also the same tree-code) when generating the epilog code and
    7417              :           when generating the code inside the loop.  */
    7418              : 
    7419        73645 :   code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
    7420              : 
    7421              :   /* If-conversion might have created a conditional operation like
    7422              :      IFN_COND_ADD already.  Use the internal code for the following checks.  */
    7423        73645 :   if (orig_code.is_internal_fn ())
    7424              :     {
    7425         6837 :       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
    7426         6837 :       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
    7427              :     }
    7428              : 
    7429        73645 :   VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
    7430              : 
    7431        73645 :   reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
    7432        73645 :   if (reduction_type == TREE_CODE_REDUCTION)
    7433              :     {
    7434              :       /* Check whether it's ok to change the order of the computation.
    7435              :          Generally, when vectorizing a reduction we change the order of the
    7436              :          computation.  This may change the behavior of the program in some
    7437              :          cases, so we need to check that this is ok.  One exception is when
    7438              :          vectorizing an outer-loop: the inner-loop is executed sequentially,
    7439              :          and therefore vectorizing reductions in the inner-loop during
    7440              :          outer-loop vectorization is safe.  Likewise when we are vectorizing
    7441              :          a series of reductions using SLP and the VF is one the reductions
    7442              :          are performed in scalar order.  */
    7443        72827 :       if (!reduc_chain
    7444        72827 :           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
    7445              :         ;
    7446        72670 :       else if (needs_fold_left_reduction_p (op.type, orig_code))
    7447              :         {
    7448              :           /* When vectorizing a reduction chain w/o SLP the reduction PHI
    7449              :              is not directly used in stmt.  */
    7450         5174 :           if (reduc_chain_length != 1)
    7451              :             {
    7452           73 :               if (dump_enabled_p ())
    7453           12 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7454              :                                  "in-order reduction chain without SLP.\n");
    7455           73 :               return false;
    7456              :             }
    7457              :           /* Code generation doesn't support function calls other
    7458              :              than .COND_*.  */
    7459         5101 :           if (!op.code.is_tree_code ()
    7460         5309 :               && !(op.code.is_internal_fn ()
    7461          104 :                    && conditional_internal_fn_code (internal_fn (op.code))
    7462              :                         != ERROR_MARK))
    7463              :             {
    7464           18 :               if (dump_enabled_p ())
    7465           16 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7466              :                                  "in-order reduction chain operation not "
    7467              :                                  "supported.\n");
    7468           18 :               return false;
    7469              :             }
    7470         5083 :           VECT_REDUC_INFO_TYPE (reduc_info)
    7471         5083 :             = reduction_type = FOLD_LEFT_REDUCTION;
    7472              :         }
    7473        67496 :       else if (!commutative_binary_op_p (orig_code, op.type)
    7474        67496 :                || !associative_binary_op_p (orig_code, op.type))
    7475              :         {
    7476          144 :           if (dump_enabled_p ())
    7477           20 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7478              :                             "reduction: not commutative/associative\n");
    7479          144 :           return false;
    7480              :         }
    7481              :     }
    7482              : 
    7483         5083 :   if ((reduction_type == COND_REDUCTION
    7484              :        || reduction_type == INTEGER_INDUC_COND_REDUCTION
    7485              :        || reduction_type == CONST_COND_REDUCTION
    7486        68327 :        || reduction_type == EXTRACT_LAST_REDUCTION)
    7487          818 :       && ncopies > 1)
    7488              :     {
    7489          276 :       if (dump_enabled_p ())
    7490           60 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7491              :                          "multiple types in condition reduction.\n");
    7492          276 :       return false;
    7493              :     }
    7494              : 
    7495              :   /* See if we can convert a mask vector to a corresponding bool data vector
    7496              :      to perform the epilogue reduction.  */
    7497        73134 :   tree alt_vectype_out = NULL_TREE;
    7498        73134 :   if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
    7499              :     {
    7500         1141 :       alt_vectype_out
    7501         2282 :         = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
    7502         1141 :                                                TREE_TYPE (vectype_out),
    7503              :                                                TYPE_VECTOR_SUBPARTS
    7504              :                                                  (vectype_out));
    7505         1141 :       if (!alt_vectype_out
    7506         1141 :           || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
    7507         2255 :                        TYPE_VECTOR_SUBPARTS (vectype_out))
    7508         2282 :           || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
    7509           27 :         alt_vectype_out = NULL_TREE;
    7510              :     }
    7511              : 
    7512        73134 :   internal_fn reduc_fn = IFN_LAST;
    7513        73134 :   if (reduction_type == TREE_CODE_REDUCTION
    7514        73134 :       || reduction_type == FOLD_LEFT_REDUCTION
    7515              :       || reduction_type == INTEGER_INDUC_COND_REDUCTION
    7516          542 :       || reduction_type == CONST_COND_REDUCTION)
    7517              :     {
    7518        67623 :       if (reduction_type == FOLD_LEFT_REDUCTION
    7519        77003 :           ? fold_left_reduction_fn (orig_code, &reduc_fn)
    7520        67623 :           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
    7521              :         {
    7522        72034 :           internal_fn sbool_fn = IFN_LAST;
    7523        72034 :           if (reduc_fn == IFN_LAST)
    7524              :             ;
    7525        69996 :           else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
    7526         1141 :                     || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
    7527              :                         == MODE_VECTOR_BOOL))
    7528       138851 :                    && direct_internal_fn_supported_p (reduc_fn, vectype_out,
    7529              :                                                       OPTIMIZE_FOR_SPEED))
    7530              :             ;
    7531        18386 :           else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
    7532         1141 :                    && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
    7533        19527 :                    && direct_internal_fn_supported_p (sbool_fn, vectype_out,
    7534              :                                                       OPTIMIZE_FOR_SPEED))
    7535          125 :             reduc_fn = sbool_fn;
    7536        18261 :           else if (reduction_type != FOLD_LEFT_REDUCTION
    7537        18261 :                    && alt_vectype_out
    7538        18261 :                    && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
    7539              :                                                       OPTIMIZE_FOR_SPEED))
    7540          795 :             VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
    7541              :           else
    7542              :             {
    7543        17466 :               if (dump_enabled_p ())
    7544          942 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7545              :                                  "reduc op not supported by target.\n");
    7546              : 
    7547        17466 :               reduc_fn = IFN_LAST;
    7548              :             }
    7549              :         }
    7550              :       else
    7551              :         {
    7552          672 :           if (dump_enabled_p ())
    7553           48 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7554              :                              "no reduc code for scalar code.\n");
    7555              : 
    7556          672 :           return false;
    7557              :         }
    7558        72034 :       if (reduc_fn == IFN_LAST
    7559        72034 :           && VECTOR_BOOLEAN_TYPE_P (vectype_out))
    7560              :         {
    7561          221 :           if (!alt_vectype_out)
    7562              :             {
    7563           12 :               if (dump_enabled_p ())
    7564            8 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7565              :                                  "cannot turn mask into bool data vector for "
    7566              :                                  "reduction epilogue.\n");
    7567           12 :               return false;
    7568              :             }
    7569          209 :           VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
    7570              :         }
    7571              :     }
    7572          428 :   else if (reduction_type == COND_REDUCTION)
    7573              :     {
    7574          428 :       int scalar_precision
    7575          428 :         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
    7576          428 :       cr_index_scalar_type = make_unsigned_type (scalar_precision);
    7577          428 :       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
    7578              :                                                 vectype_out);
    7579              : 
    7580          428 :       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
    7581              :                                           OPTIMIZE_FOR_SPEED))
    7582           22 :         reduc_fn = IFN_REDUC_MAX;
    7583              :     }
    7584        72450 :   VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
    7585              : 
    7586        72450 :   if (reduction_type != EXTRACT_LAST_REDUCTION
    7587              :       && reduc_fn == IFN_LAST
    7588              :       && !nunits_out.is_constant ())
    7589              :     {
    7590              :       if (dump_enabled_p ())
    7591              :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7592              :                          "missing target support for reduction on"
    7593              :                          " variable-length vectors.\n");
    7594              :       return false;
    7595              :     }
    7596              : 
    7597              :   /* For SLP reductions, see if there is a neutral value we can use.  */
    7598        72450 :   tree neutral_op = NULL_TREE;
    7599        72450 :   tree initial_value = NULL_TREE;
    7600        72450 :   if (reduc_chain)
    7601         2240 :     initial_value = vect_phi_initial_value (reduc_def_phi);
    7602        72450 :   neutral_op = neutral_op_for_reduction (TREE_TYPE
    7603              :                                            (gimple_phi_result (reduc_def_phi)),
    7604              :                                          orig_code, initial_value);
    7605        72450 :   VECT_REDUC_INFO_NEUTRAL_OP (reduc_info) = neutral_op;
    7606              : 
    7607        72450 :   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
    7608              :     {
    7609              :       /* We can't support in-order reductions of code such as this:
    7610              : 
    7611              :            for (int i = 0; i < n1; ++i)
    7612              :              for (int j = 0; j < n2; ++j)
    7613              :                l += a[j];
    7614              : 
    7615              :          since GCC effectively transforms the loop when vectorizing:
    7616              : 
    7617              :            for (int i = 0; i < n1 / VF; ++i)
    7618              :              for (int j = 0; j < n2; ++j)
    7619              :                for (int k = 0; k < VF; ++k)
    7620              :                  l += a[j];
    7621              : 
    7622              :          which is a reassociation of the original operation.  */
    7623           66 :       if (dump_enabled_p ())
    7624           20 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7625              :                          "in-order double reduction not supported.\n");
    7626              : 
    7627           66 :       return false;
    7628              :     }
    7629              : 
    7630        72384 :   if (reduction_type == FOLD_LEFT_REDUCTION
    7631         4345 :       && SLP_TREE_LANES (slp_node) > 1
    7632          159 :       && !reduc_chain)
    7633              :     {
    7634              :       /* We cannot use in-order reductions in this case because there is
    7635              :          an implicit reassociation of the operations involved.  */
    7636           64 :       if (dump_enabled_p ())
    7637            8 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7638              :                          "in-order unchained SLP reductions not supported.\n");
    7639           64 :       return false;
    7640              :     }
    7641              : 
    7642              :   /* For double reductions, and for SLP reductions with a neutral value,
    7643              :      we construct a variable-length initial vector by loading a vector
    7644              :      full of the neutral value and then shift-and-inserting the start
    7645              :      values into the low-numbered elements.  This is however not needed
    7646              :      when neutral and initial value are equal or we can handle the
    7647              :      initial value via adjustment in the epilogue.  */
    7648        72320 :   if ((double_reduc || neutral_op)
    7649              :       && !nunits_out.is_constant ()
    7650              :       && reduction_type != INTEGER_INDUC_COND_REDUCTION
    7651              :       && !((SLP_TREE_LANES (slp_node) == 1 || reduc_chain)
    7652              :            && neutral_op
    7653              :            && (!double_reduc
    7654              :                || operand_equal_p (neutral_op,
    7655              :                                    vect_phi_initial_value (reduc_def_phi))))
    7656              :       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
    7657              :                                           vectype_out, OPTIMIZE_FOR_BOTH))
    7658              :     {
    7659              :       if (dump_enabled_p ())
    7660              :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7661              :                          "reduction on variable-length vectors requires"
    7662              :                          " target support for a vector-shift-and-insert"
    7663              :                          " operation.\n");
    7664              :       return false;
    7665              :     }
    7666              : 
    7667              :   /* Check extra constraints for variable-length unchained SLP reductions.  */
    7668        72320 :   if (!reduc_chain
    7669              :       && !nunits_out.is_constant ())
    7670              :     {
    7671              :       /* We checked above that we could build the initial vector when
    7672              :          there's a neutral element value.  Check here for the case in
    7673              :          which each SLP statement has its own initial value and in which
    7674              :          that value needs to be repeated for every instance of the
    7675              :          statement within the initial vector.  */
    7676              :       unsigned int group_size = SLP_TREE_LANES (slp_node);
    7677              :       if (!neutral_op
    7678              :           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
    7679              :                                               TREE_TYPE (vectype_out)))
    7680              :         {
    7681              :           if (dump_enabled_p ())
    7682              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7683              :                              "unsupported form of SLP reduction for"
    7684              :                              " variable-length vectors: cannot build"
    7685              :                              " initial vector.\n");
    7686              :           return false;
    7687              :         }
    7688              :       /* The epilogue code relies on the number of elements being a multiple
    7689              :          of the group size.  The duplicate-and-interleave approach to setting
    7690              :          up the initial vector does too.  */
    7691              :       if (!multiple_p (nunits_out, group_size))
    7692              :         {
    7693              :           if (dump_enabled_p ())
    7694              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7695              :                              "unsupported form of SLP reduction for"
    7696              :                              " variable-length vectors: the vector size"
    7697              :                              " is not a multiple of the number of results.\n");
    7698              :           return false;
    7699              :         }
    7700              :     }
    7701              : 
    7702        72320 :   if (reduction_type == COND_REDUCTION)
    7703              :     {
    7704          428 :       widest_int ni;
    7705              : 
    7706          428 :       if (! max_loop_iterations (loop, &ni))
    7707              :         {
    7708           14 :           if (dump_enabled_p ())
    7709            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    7710              :                              "loop count not known, cannot create cond "
    7711              :                              "reduction.\n");
    7712           14 :           return false;
    7713              :         }
    7714              :       /* Convert backedges to iterations.  */
    7715          414 :       ni += 1;
    7716              : 
    7717              :       /* The additional index will be the same type as the condition.  Check
    7718              :          that the loop can fit into this less one (because we'll use up the
    7719              :          zero slot for when there are no matches).  */
    7720          414 :       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
    7721          414 :       if (wi::geu_p (ni, wi::to_widest (max_index)))
    7722              :         {
    7723           90 :           if (dump_enabled_p ())
    7724           54 :             dump_printf_loc (MSG_NOTE, vect_location,
    7725              :                              "loop size is greater than data size.\n");
    7726           90 :           return false;
    7727              :         }
    7728          428 :     }
    7729              : 
    7730              :   /* In case the vectorization factor (VF) is bigger than the number
    7731              :      of elements that we can fit in a vectype (nunits), we have to generate
    7732              :      more than one vector stmt - i.e - we need to "unroll" the
    7733              :      vector stmt by a factor VF/nunits.  For more details see documentation
    7734              :      in vectorizable_operation.  */
    7735              : 
    7736              :   /* If the reduction is used in an outer loop we need to generate
    7737              :      VF intermediate results, like so (e.g. for ncopies=2):
    7738              :         r0 = phi (init, r0)
    7739              :         r1 = phi (init, r1)
    7740              :         r0 = x0 + r0;
    7741              :         r1 = x1 + r1;
    7742              :     (i.e. we generate VF results in 2 registers).
    7743              :     In this case we have a separate def-use cycle for each copy, and therefore
    7744              :     for each copy we get the vector def for the reduction variable from the
    7745              :     respective phi node created for this copy.
    7746              : 
    7747              :     Otherwise (the reduction is unused in the loop nest), we can combine
    7748              :     together intermediate results, like so (e.g. for ncopies=2):
    7749              :         r = phi (init, r)
    7750              :         r = x0 + r;
    7751              :         r = x1 + r;
    7752              :    (i.e. we generate VF/2 results in a single register).
    7753              :    In this case for each copy we get the vector def for the reduction variable
    7754              :    from the vectorized reduction operation generated in the previous iteration.
    7755              : 
    7756              :    This only works when we see both the reduction PHI and its only consumer
    7757              :    in vectorizable_reduction and there are no intermediate stmts
    7758              :    participating.  When unrolling we want each unrolled iteration to have its
    7759              :    own reduction accumulator since one of the main goals of unrolling a
    7760              :    reduction is to reduce the aggregate loop-carried latency.  */
    7761        72216 :   if (ncopies > 1
    7762        72216 :       && !reduc_chain
    7763         8027 :       && SLP_TREE_LANES (slp_node) == 1
    7764         7859 :       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
    7765         7836 :       && reduc_chain_length == 1
    7766         7422 :       && loop_vinfo->suggested_unroll_factor == 1)
    7767        72216 :     single_defuse_cycle = true;
    7768              : 
    7769        72216 :   if (single_defuse_cycle && !lane_reducing)
    7770              :     {
    7771         6474 :       gcc_assert (op.code != COND_EXPR);
    7772              : 
    7773              :       /* 4. check support for the operation in the loop
    7774              : 
    7775              :          This isn't necessary for the lane reduction codes, since they
    7776              :          can only be produced by pattern matching, and it's up to the
    7777              :          pattern matcher to test for support.  The main reason for
    7778              :          specifically skipping this step is to avoid rechecking whether
    7779              :          mixed-sign dot-products can be implemented using signed
    7780              :          dot-products.  */
    7781         6474 :       machine_mode vec_mode = TYPE_MODE (vectype_in);
    7782         6474 :       if (!directly_supported_p (op.code, vectype_in, optab_vector))
    7783              :         {
    7784         2065 :           if (dump_enabled_p ())
    7785           44 :             dump_printf (MSG_NOTE, "op not supported by target.\n");
    7786         4130 :           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
    7787         2065 :               || !vect_can_vectorize_without_simd_p (op.code))
    7788              :             single_defuse_cycle = false;
    7789              :           else
    7790            5 :             if (dump_enabled_p ())
    7791            0 :               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
    7792              :         }
    7793              : 
    7794         6474 :       if (vect_emulated_vector_p (vectype_in)
    7795         6474 :           && !vect_can_vectorize_without_simd_p (op.code))
    7796              :         {
    7797            0 :           if (dump_enabled_p ())
    7798            0 :             dump_printf (MSG_NOTE, "using word mode not possible.\n");
    7799            0 :           return false;
    7800              :         }
    7801              :     }
    7802        72216 :   if (dump_enabled_p () && single_defuse_cycle)
    7803          701 :     dump_printf_loc (MSG_NOTE, vect_location,
    7804              :                      "using single def-use cycle for reduction by reducing "
    7805              :                      "multiple vectors to one in the loop body\n");
    7806        72216 :   VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
    7807              : 
    7808              :   /* For lane-reducing operation, the below processing related to single
    7809              :      defuse-cycle will be done in its own vectorizable function.  One more
    7810              :      thing to note is that the operation must not be involved in fold-left
    7811              :      reduction.  */
    7812        72216 :   single_defuse_cycle &= !lane_reducing;
    7813              : 
    7814        72216 :   if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
    7815        28272 :     for (i = 0; i < (int) op.num_ops; i++)
    7816        19646 :       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
    7817              :         {
    7818            0 :           if (dump_enabled_p ())
    7819            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    7820              :                              "incompatible vector types for invariants\n");
    7821            0 :           return false;
    7822              :         }
    7823              : 
    7824        72216 :   vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
    7825              :                              reduction_type, ncopies, cost_vec);
    7826              :   /* Cost the reduction op inside the loop if transformed via
    7827              :      vect_transform_reduction for non-lane-reducing operation.  Otherwise
    7828              :      this is costed by the separate vectorizable_* routines.  */
    7829        72216 :   if (single_defuse_cycle)
    7830         4414 :     record_stmt_cost (cost_vec, ncopies, vector_stmt,
    7831              :                       slp_for_stmt_info, 0, vect_body);
    7832              : 
    7833        72216 :   if (dump_enabled_p ()
    7834        72216 :       && reduction_type == FOLD_LEFT_REDUCTION)
    7835          264 :     dump_printf_loc (MSG_NOTE, vect_location,
    7836              :                      "using an in-order (fold-left) reduction.\n");
    7837        72216 :   SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
    7838              : 
    7839              :   /* All but single defuse-cycle optimized and fold-left reductions go
    7840              :      through their own vectorizable_* routines.  */
    7841        72216 :   stmt_vec_info tem
    7842        72216 :     = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
    7843        72216 :   if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
    7844        63590 :     STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
    7845              :   else
    7846              :     {
    7847         8626 :       STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
    7848         8626 :       if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    7849         3980 :         vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
    7850              :                                                     slp_node, op.code, op.type,
    7851              :                                                     vectype_in);
    7852              :     }
    7853              :   return true;
    7854              : }
    7855              : 
    7856              : /* STMT_INFO is a dot-product reduction whose multiplication operands
    7857              :    have different signs.  Emit a sequence to emulate the operation
    7858              :    using a series of signed DOT_PROD_EXPRs and return the last
    7859              :    statement generated (built but not inserted at GSI).  VEC_DEST is the
    7860              :    result of the vector operation and VOP lists its inputs (overwritten).  */
    7861              : 
    7862              : static gassign *
    7863            4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
    7864              :                              gimple_stmt_iterator *gsi, tree vec_dest,
    7865              :                              tree vop[3])
    7866              : {
    7867            4 :   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
    7868            4 :   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
    7869            4 :   tree narrow_elttype = TREE_TYPE (narrow_vectype);
    7870            4 :   gimple *new_stmt;
    7871              : 
    7872              :   /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
    7873            4 :   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
    7874            0 :     std::swap (vop[0], vop[1]);
    7875              : 
    7876              :   /* Convert VOP[1] and VOP[2] to signed types; VOP[0] is handled below.  */
    7877           12 :   for (int i = 1; i < 3; ++i)
    7878            8 :     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
    7879              :       {
    7880            0 :         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
    7881            0 :         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
    7882            0 :         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
    7883            0 :         vop[i] = tmp;
    7884              :       }
    7885              : 
    7886              :   /* In the comments below we assume 8-bit inputs for simplicity,
    7887              :      but the approach works for any full integer type.  */
    7888              : 
    7889              :   /* Create a vector of -128, converted to the element type of VOP[0].  */
    7890            4 :   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
    7891            4 :   tree min_narrow = build_vector_from_val (TREE_TYPE (vop[0]),
    7892            4 :                                            fold_convert
    7893              :                                              (TREE_TYPE (TREE_TYPE (vop[0])),
    7894              :                                               min_narrow_elttype));
    7895              : 
    7896              :   /* Create a vector of 64 (the minimum value logically shifted right by 1).  */
    7897            4 :   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
    7898            4 :   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
    7899            4 :   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
    7900              : 
    7901              :   /* Emit: SUB_RES = VOP[0] - 128 in an unsigned type, then view it as signed.  */
    7902            4 :   tree sub_res = make_ssa_name (TREE_TYPE (vop[0]));
    7903            4 :   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
    7904            4 :   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
    7905              : 
    7906            4 :   vop[0] = make_ssa_name (narrow_vectype);
    7907            4 :   new_stmt = gimple_build_assign (vop[0], VIEW_CONVERT_EXPR,
    7908              :                                   build1 (VIEW_CONVERT_EXPR, narrow_vectype,
    7909              :                                           sub_res));
    7910            4 :   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
    7911              : 
    7912              :   /* Emit:
    7913              : 
    7914              :        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
    7915              :        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
    7916              :        STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
    7917              : 
    7918              :      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
    7919              :      Doing the two 64 * y steps first allows more time to compute x.  */
    7920            4 :   tree stage1 = make_ssa_name (wide_vectype);
    7921            4 :   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
    7922              :                                   vop[1], half_narrow, vop[2]);
    7923            4 :   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
    7924              : 
    7925            4 :   tree stage2 = make_ssa_name (wide_vectype);
    7926            4 :   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
    7927              :                                   vop[1], half_narrow, stage1);
    7928            4 :   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
    7929              : 
    7930            4 :   tree stage3 = make_ssa_name (wide_vectype);
    7931            4 :   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
    7932              :                                   vop[0], vop[1], stage2);
    7933            4 :   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
    7934              : 
    7935              :   /* Convert STAGE3 to the reduction type; the caller inserts this stmt.  */
    7936            4 :   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
    7937            4 : }
    7938              : 
    7939              : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
    7940              :    value.  */
    7941              : 
    7942              : bool
    7943         2636 : vect_transform_reduction (loop_vec_info loop_vinfo,
    7944              :                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
    7945              :                           slp_tree slp_node)
    7946              : {
    7947         2636 :   tree vectype_out = SLP_TREE_VECTYPE (slp_node);
    7948         2636 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    7949         2636 :   unsigned vec_num;
    7950              : 
    7951         2636 :   vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
    7952              : 
    7953         2636 :   if (nested_in_vect_loop_p (loop, stmt_info))
    7954              :     {
    7955            0 :       loop = loop->inner;
    7956            0 :       gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
    7957              :                   == vect_double_reduction_def);
    7958              :     }
    7959              : 
    7960         2636 :   gimple_match_op op;
    7961         2636 :   if (!gimple_extract_op (stmt_info->stmt, &op))
    7962            0 :     gcc_unreachable ();
    7963              : 
    7964              :   /* All uses but the last are expected to be defined in the loop.
    7965              :      The last use is the reduction variable.  In case of nested cycle this
    7966              :      assumption is not true: we use reduc_index to record the index of the
    7967              :      reduction variable.  */
    7968         2636 :   int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
    7969         2636 :   tree vectype_in = SLP_TREE_VECTYPE (slp_node);
    7970         2636 :   if (lane_reducing_op_p (op.code))
    7971          262 :     vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
    7972              : 
    7973         2636 :   vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
    7974              : 
    7975         2636 :   code_helper code = canonicalize_code (op.code, op.type);
    7976         2636 :   internal_fn cond_fn
    7977          483 :     = ((code.is_internal_fn ()
    7978          483 :         && internal_fn_mask_index ((internal_fn)code) != -1)
    7979         2636 :        ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
    7980              : 
    7981         2636 :   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
    7982         2636 :   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
    7983         2636 :   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
    7984              : 
    7985              :   /* Transform.  */
    7986         2636 :   tree new_temp = NULL_TREE;
    7987        18452 :   auto_vec<tree> vec_oprnds[3];
    7988              : 
    7989         2636 :   if (dump_enabled_p ())
    7990          770 :     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
    7991              : 
    7992              :   /* A binary COND_OP reduction must have the same definition and else
    7993              :      value. */
    7994         3119 :   bool cond_fn_p = code.is_internal_fn ()
    7995          483 :     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
    7996          483 :   if (cond_fn_p)
    7997              :     {
    7998          483 :       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
    7999              :                   || code == IFN_COND_MUL || code == IFN_COND_AND
    8000              :                   || code == IFN_COND_IOR || code == IFN_COND_XOR
    8001              :                   || code == IFN_COND_MIN || code == IFN_COND_MAX);
    8002          483 :       gcc_assert (op.num_ops == 4
    8003              :                   && (op.ops[reduc_index]
    8004              :                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
    8005              :     }
    8006              : 
    8007         2636 :   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
    8008              : 
    8009         2636 :   vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
    8010         2636 :   if (reduction_type == FOLD_LEFT_REDUCTION)
    8011              :     {
    8012          895 :       internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
    8013          895 :       gcc_assert (code.is_tree_code () || cond_fn_p);
    8014          895 :       return vectorize_fold_left_reduction
    8015          895 :           (loop_vinfo, stmt_info, gsi, slp_node,
    8016          895 :            code, reduc_fn, op.num_ops, vectype_in,
    8017          895 :            reduc_index, masks, lens);
    8018              :     }
    8019              : 
    8020         1741 :   bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
    8021         1741 :   bool lane_reducing = lane_reducing_op_p (code);
    8022         1479 :   gcc_assert (single_defuse_cycle || lane_reducing);
    8023              : 
    8024         1741 :   if (lane_reducing)
    8025              :     {
    8026              :       /* The last operand of lane-reducing op is for reduction.  */
    8027          262 :       gcc_assert (reduc_index == (int) op.num_ops - 1);
    8028              :     }
    8029              : 
    8030              :   /* Create the destination vector  */
    8031         1741 :   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
    8032         1741 :   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
    8033              : 
    8034              :   /* Get NCOPIES vector definitions for all operands except the reduction
    8035              :      definition.  */
    8036         1741 :   if (!cond_fn_p)
    8037              :     {
    8038         1288 :       gcc_assert (reduc_index >= 0 && reduc_index <= 2);
    8039         2121 :       vect_get_vec_defs (loop_vinfo, slp_node,
    8040         1288 :                          single_defuse_cycle && reduc_index == 0
    8041              :                          ? NULL_TREE : op.ops[0], &vec_oprnds[0],
    8042         1288 :                          single_defuse_cycle && reduc_index == 1
    8043              :                          ? NULL_TREE : op.ops[1], &vec_oprnds[1],
    8044         1288 :                          op.num_ops == 3
    8045          262 :                          && !(single_defuse_cycle && reduc_index == 2)
    8046              :                          ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
    8047              :     }
    8048              :   else
    8049              :     {
    8050              :       /* For a conditional operation pass the truth type as mask
    8051              :          vectype.  */
    8052          453 :       gcc_assert (single_defuse_cycle
    8053              :                   && (reduc_index == 1 || reduc_index == 2));
    8054          453 :       vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
    8055              :                          &vec_oprnds[0],
    8056              :                          reduc_index == 1 ? NULL_TREE : op.ops[1],
    8057              :                          &vec_oprnds[1],
    8058              :                          reduc_index == 2 ? NULL_TREE : op.ops[2],
    8059              :                          &vec_oprnds[2]);
    8060              :     }
    8061              : 
    8062              :   /* For single def-use cycles get one copy of the vectorized reduction
    8063              :      definition.  */
    8064         1741 :   if (single_defuse_cycle)
    8065              :     {
    8066         1650 :       vect_get_vec_defs (loop_vinfo, slp_node,
    8067              :                          reduc_index == 0 ? op.ops[0] : NULL_TREE,
    8068              :                          &vec_oprnds[0],
    8069              :                          reduc_index == 1 ? op.ops[1] : NULL_TREE,
    8070              :                          &vec_oprnds[1],
    8071              :                          reduc_index == 2 ? op.ops[2] : NULL_TREE,
    8072              :                          &vec_oprnds[2]);
    8073              :     }
    8074           91 :   else if (lane_reducing)
    8075              :     {
    8076              :       /* For normal reduction, consistency between vectorized def/use is
    8077              :          naturally ensured when mapping from scalar statement.  But if lane-
    8078              :          reducing op is involved in reduction, thing would become somewhat
    8079              :          complicated in that the op's result and operand for accumulation are
    8080              :          limited to less lanes than other operands, which certainly causes
    8081              :          def/use mismatch on adjacent statements around the op if do not have
    8082              :          any kind of specific adjustment.  One approach is to refit lane-
    8083              :          reducing op in the way of introducing new trivial pass-through copies
    8084              :          to fix possible def/use gap, so as to make it behave like a normal op.
    8085              :          And vector reduction PHIs are always generated to the full extent, no
    8086              :          matter lane-reducing op exists or not.  If some copies or PHIs are
    8087              :          actually superfluous, they would be cleaned up by passes after
    8088              :          vectorization.  An example for single-lane slp, lane-reducing ops
    8089              :          with mixed input vectypes in a reduction chain, is given as below.
    8090              :          Similarly, this handling is applicable for multiple-lane slp as well.
    8091              : 
    8092              :            int sum = 1;
    8093              :            for (i)
    8094              :              {
    8095              :                sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
    8096              :                sum += w[i];               // widen-sum <vector(16) char>
    8097              :                sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
    8098              :                sum += n[i];               // normal <vector(4) int>
    8099              :              }
    8100              : 
    8101              :          The vector size is 128-bit, vectorization factor is 16.  Reduction
    8102              :          statements would be transformed as:
    8103              : 
    8104              :            vector<4> int sum_v0 = { 0, 0, 0, 1 };
    8105              :            vector<4> int sum_v1 = { 0, 0, 0, 0 };
    8106              :            vector<4> int sum_v2 = { 0, 0, 0, 0 };
    8107              :            vector<4> int sum_v3 = { 0, 0, 0, 0 };
    8108              : 
    8109              :            for (i / 16)
    8110              :              {
    8111              :                sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
    8112              :                sum_v1 = sum_v1;  // copy
    8113              :                sum_v2 = sum_v2;  // copy
    8114              :                sum_v3 = sum_v3;  // copy
    8115              : 
    8116              :                sum_v0 = sum_v0;  // copy
    8117              :                sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
    8118              :                sum_v2 = sum_v2;  // copy
    8119              :                sum_v3 = sum_v3;  // copy
    8120              : 
    8121              :                sum_v0 = sum_v0;  // copy
    8122              :                sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
    8123              :                sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
    8124              :                sum_v3 = sum_v3;  // copy
    8125              : 
    8126              :                sum_v0 += n_v0[i: 0  ~ 3 ];
    8127              :                sum_v1 += n_v1[i: 4  ~ 7 ];
    8128              :                sum_v2 += n_v2[i: 8  ~ 11];
    8129              :                sum_v3 += n_v3[i: 12 ~ 15];
    8130              :              }
    8131              : 
    8132              :          Moreover, for a higher instruction parallelism in final vectorized
    8133              :          loop, it is considered to make those effective vector lane-reducing
    8134              :          ops be distributed evenly among all def-use cycles.  In the above
    8135              :          example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
    8136              :          cycles, instruction dependency among them could be eliminated.  */
    8137           91 :       unsigned effec_ncopies = vec_oprnds[0].length ();
    8138           91 :       unsigned total_ncopies = vec_oprnds[reduc_index].length ();
    8139              : 
    8140           91 :       gcc_assert (effec_ncopies <= total_ncopies);
    8141              : 
    8142           91 :       if (effec_ncopies < total_ncopies)
    8143              :         {
    8144          273 :           for (unsigned i = 0; i < op.num_ops - 1; i++)
    8145              :             {
    8146          364 :               gcc_assert (vec_oprnds[i].length () == effec_ncopies);
    8147          182 :               vec_oprnds[i].safe_grow_cleared (total_ncopies);
    8148              :             }
    8149              :         }
    8150              : 
    8151           91 :       tree reduc_vectype_in = vectype_in;
    8152           91 :       gcc_assert (reduc_vectype_in);
    8153              : 
    8154           91 :       unsigned effec_reduc_ncopies
    8155           91 :         = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
    8156              : 
    8157           91 :       gcc_assert (effec_ncopies <= effec_reduc_ncopies);
    8158              : 
    8159           91 :       if (effec_ncopies < effec_reduc_ncopies)
    8160              :         {
    8161              :           /* Find suitable def-use cycles to generate vectorized statements
    8162              :              into, and reorder operands based on the selection.  */
    8163            0 :           unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
    8164            0 :           unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
    8165              : 
    8166            0 :           gcc_assert (curr_pos < effec_reduc_ncopies);
    8167            0 :           VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
    8168              : 
    8169            0 :           if (curr_pos)
    8170              :             {
    8171            0 :               unsigned count = effec_reduc_ncopies - effec_ncopies;
    8172            0 :               unsigned start = curr_pos - count;
    8173              : 
    8174            0 :               if ((int) start < 0)
    8175              :                 {
    8176            0 :                   count = curr_pos;
    8177            0 :                   start = 0;
    8178              :                 }
    8179              : 
    8180            0 :               for (unsigned i = 0; i < op.num_ops - 1; i++)
    8181              :                 {
    8182            0 :                   for (unsigned j = effec_ncopies; j > start; j--)
    8183              :                     {
    8184            0 :                       unsigned k = j - 1;
    8185            0 :                       std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
    8186            0 :                       gcc_assert (!vec_oprnds[i][k]);
    8187              :                     }
    8188              :                 }
    8189              :             }
    8190              :         }
    8191              :     }
    8192              : 
    8193         1741 :   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
    8194         3002 :   unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
    8195         1741 :   unsigned mask_index = 0;
    8196              : 
    8197         7654 :   for (unsigned i = 0; i < num; ++i)
    8198              :     {
    8199         5913 :       gimple *new_stmt;
    8200         5913 :       tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
    8201         5913 :       if (!vop[0] || !vop[1])
    8202              :         {
    8203          479 :           tree reduc_vop = vec_oprnds[reduc_index][i];
    8204              : 
    8205              :           /* If we could not generate an effective vector statement for the
    8206              :              current portion of the reduction operand, insert a trivial copy to
    8207              :              simply hand over the operand to other dependent statements.  */
    8208          479 :           gcc_assert (reduc_vop);
    8209              : 
    8210          479 :           if (TREE_CODE (reduc_vop) == SSA_NAME
    8211          479 :               && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
    8212          479 :             new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
    8213              :           else
    8214              :             {
    8215            0 :               new_temp = make_ssa_name (vec_dest);
    8216            0 :               new_stmt = gimple_build_assign (new_temp, reduc_vop);
    8217            0 :               vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
    8218              :                                            gsi);
    8219              :             }
    8220              :         }
    8221         5434 :       else if (masked_loop_p && !mask_by_cond_expr)
    8222              :         {
    8223              :           /* No conditional ifns have been defined for lane-reducing op
    8224              :              yet.  */
    8225           16 :           gcc_assert (!lane_reducing);
    8226              : 
    8227           16 :           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
    8228              :                                           vec_num, vectype_in,
    8229              :                                           mask_index++);
    8230           16 :           gcall *call;
    8231           24 :           if (code.is_internal_fn () && cond_fn_p)
    8232              :             {
    8233           16 :               gcc_assert (op.num_ops >= 3
    8234              :                           && internal_fn_mask_index (internal_fn (code)) == 0);
    8235            8 :               vop[2] = vec_oprnds[2][i];
    8236            8 :               mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
    8237              :                                        mask, vop[0], gsi);
    8238            8 :               call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
    8239              :                                                  vop[2], vop[reduc_index]);
    8240              :             }
    8241              :           else
    8242            8 :             call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
    8243              :                                                vop[1], vop[reduc_index]);
    8244           16 :           new_temp = make_ssa_name (vec_dest, call);
    8245           16 :           gimple_call_set_lhs (call, new_temp);
    8246           16 :           gimple_call_set_nothrow (call, true);
    8247           16 :           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
    8248           16 :           new_stmt = call;
    8249              :         }
    8250              :       else
    8251              :         {
    8252         5418 :           if (op.num_ops >= 3)
    8253         1772 :             vop[2] = vec_oprnds[2][i];
    8254              : 
    8255         5418 :           if (masked_loop_p && mask_by_cond_expr)
    8256              :             {
    8257            4 :               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
    8258              :                                               vec_num, vectype_in,
    8259              :                                               mask_index++);
    8260            4 :               build_vect_cond_expr (code, vop, mask, gsi);
    8261              :             }
    8262              : 
    8263         5418 :           if (emulated_mixed_dot_prod)
    8264            4 :             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
    8265              :                                                     vec_dest, vop);
    8266              : 
    8267         6756 :           else if (code.is_internal_fn () && !cond_fn_p)
    8268            0 :             new_stmt = gimple_build_call_internal (internal_fn (code),
    8269              :                                                    op.num_ops,
    8270              :                                                    vop[0], vop[1], vop[2]);
    8271         6756 :           else if (code.is_internal_fn () && cond_fn_p)
    8272         1342 :             new_stmt = gimple_build_call_internal (internal_fn (code),
    8273              :                                                    op.num_ops,
    8274              :                                                    vop[0], vop[1], vop[2],
    8275              :                                                    vop[reduc_index]);
    8276              :           else
    8277         4072 :             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
    8278              :                                             vop[0], vop[1], vop[2]);
    8279         5418 :           new_temp = make_ssa_name (vec_dest, new_stmt);
    8280         5418 :           gimple_set_lhs (new_stmt, new_temp);
    8281         5418 :           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
    8282              :         }
    8283              : 
    8284         5913 :       if (single_defuse_cycle && i < num - 1)
    8285         3535 :         vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
    8286              :       else
    8287         2378 :         slp_node->push_vec_def (new_stmt);
    8288              :     }
    8289              : 
    8290              :   return true;
    8291        10544 : }
    8292              : 
    8293              : /* Transform phase of a cycle PHI.
                      :    Creates the vector PHI nodes in the loop header for the cycle PHI
                      :    STMT_INFO of SLP_NODE, adds their loop-entry (preheader edge)
                      :    arguments and records the new PHIs as the vector defs of SLP_NODE.
                      :    The loop-latch arguments are not set here.  When the reduction is
                      :    an EXTRACT_LAST_REDUCTION or a FOLD_LEFT_REDUCTION nothing is
                      :    generated and the scalar PHI is kept.  Always returns true.  */
    8294              : 
    8295              : bool
    8296        23727 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
    8297              :                           stmt_vec_info stmt_info,
    8298              :                           slp_tree slp_node, slp_instance slp_node_instance)
    8299              : {
    8300        23727 :   tree vectype_out = SLP_TREE_VECTYPE (slp_node);
    8301        23727 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    8302        23727 :   int i;
    8303        23727 :   bool nested_cycle = false;
    8304        23727 :   int vec_num;
    8305              : 
    8306        23865 :   if (nested_in_vect_loop_p (loop, stmt_info))
    8307              :     {
    8308              :       loop = loop->inner;
    8309              :       nested_cycle = true;
    8310              :     }
    8311              : 
    8312        23727 :   vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
    8313        23727 :   if (reduc_info
    8314        23065 :       && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
    8315        23065 :           || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
    8316              :     /* Leave the scalar phi in place.  */
    8317              :     return true;
    8318              : 
    8319        22170 :   if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
    8320          127 :     dump_printf_loc (MSG_NOTE, vect_location,
    8321              :                      "vectorizing a reduction chain\n");
    8322              : 
    8323        22832 :   vec_num = vect_get_num_copies (loop_vinfo, slp_node);
    8324              : 
    8325              :   /* Check whether we should use a single PHI node and accumulate
    8326              :      vectors to one before the backedge.  If so, only one vector PHI
                      :      is created below.  */
    8327        22832 :   if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
    8328        22832 :     vec_num = 1;
    8329              : 
    8330              :   /* Create the destination vector  */
    8331        22832 :   gphi *phi = as_a <gphi *> (stmt_info->stmt);
    8332        22832 :   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
    8333              :                                                vectype_out);
    8334              : 
    8335              :   /* Get the loop-entry arguments.  They are a vector built from the
                      :      induction value for INTEGER_INDUC_COND_REDUCTION, the vector defs
                      :      of the SLP child at the preheader edge's index for a nested
                      :      cycle, and otherwise they are produced by
                      :      get_initial_defs_for_reduction and/or taken from a reused
                      :      accumulator.  */
    8336        22832 :   auto_vec<tree> vec_initial_defs;
    8337        22832 :   vec_initial_defs.reserve (vec_num);
    8338              :   /* Optimize: if initial_def is for REDUC_MAX smaller than the base
    8339              :      and we can't use zero for induc_val, use initial_def.  Similarly
    8340              :      for REDUC_MIN and initial_def larger than the base.  */
    8341        22832 :   if (reduc_info
    8342        22170 :       && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
    8343              :     {
    8344           62 :       gcc_assert (SLP_TREE_LANES (slp_node) == 1);
    8345           62 :       tree initial_def = vect_phi_initial_value (phi);
    8346           62 :       VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
    8347           62 :       tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
    8348           62 :       if (TREE_CODE (initial_def) == INTEGER_CST
    8349           60 :           && !integer_zerop (induc_val)
    8350          122 :           && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
    8351           42 :                && tree_int_cst_lt (initial_def, induc_val))
    8352           58 :               || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
    8353           18 :                   && tree_int_cst_lt (induc_val, initial_def))))
    8354              :         {
    8355            2 :           induc_val = initial_def;
    8356              :           /* Communicate we used the initial_def to epilogue
    8357              :              generation.  */
    8358            2 :           VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
    8359              :         }
    8360           62 :       vec_initial_defs.quick_push
    8361           62 :         (build_vector_from_val (vectype_out, induc_val));
    8362           62 :     }
    8363        22770 :   else if (nested_cycle)
    8364              :     {
    8365          748 :       unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
    8366          748 :       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
    8367              :                          &vec_initial_defs);
    8368              :     }
    8369              :   else
    8370              :     {
    8371        22022 :       gcc_assert (slp_node == slp_node_instance->reduc_phis);
    8372        22022 :       vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
    8373        22022 :       vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
    8374              : 
    8375        22022 :       unsigned int num_phis = stmts.length ();
    8376        22022 :       if (reduc_info->is_reduc_chain)
    8377          200 :         num_phis = 1;
    8378        22022 :       initial_values.reserve (num_phis);
    8379        44489 :       for (unsigned int i = 0; i < num_phis; ++i)
    8380              :         {
    8381        22467 :           gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
    8382        22467 :           initial_values.quick_push (vect_phi_initial_value (this_phi));
    8383              :         }
    8384        22022 :       tree neutral_op = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
    8385        22022 :       if (vec_num == 1
    8386        22022 :           && vect_find_reusable_accumulator (loop_vinfo,
    8387              :                                              reduc_info, vectype_out))
    8388              :         ;
    8389              :       /* Try to simplify the vector initialization by applying an
    8390              :          adjustment after the reduction has been performed.  This
    8391              :          can also break a critical path but on the other hand
    8392              :          requires to keep the initial value live across the loop.  */
    8393        17911 :       else if (neutral_op
    8394        17336 :                && initial_values.length () == 1
    8395        17152 :                && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
    8396        34986 :                && !operand_equal_p (neutral_op, initial_values[0]))
    8397              :         {
    8398        12155 :           VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
    8399        12155 :             = initial_values[0];
    8400        12155 :           initial_values[0] = neutral_op;
    8401              :         }
    8402        22022 :       if (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
    8403         4111 :           || loop_vinfo->main_loop_edge)
    8404        43598 :         get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
    8405              :                                         &vec_initial_defs, vec_num,
    8406              :                                         stmts.length (), neutral_op);
    8407              :     }
    8408              : 
    8409        22832 :   if (reduc_info)
    8410        22170 :   if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
    8411              :     {
    8412         4111 :       tree def = accumulator->reduc_input;
    8413         4111 :       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
    8414              :         {
    8415         4108 :           unsigned int nreduc;
    8416         8216 :           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
    8417         4108 :                                             (TREE_TYPE (def)),
    8418         4108 :                                           TYPE_VECTOR_SUBPARTS (vectype_out),
    8419              :                                           &nreduc);
    8420            0 :           gcc_assert (res);
    8421         4108 :           gimple_seq stmts = NULL;
    8422              :           /* Reduce the single vector to a smaller one.  */
    8423         4108 :           if (nreduc != 1)
    8424              :             {
    8425              :               /* Perform the reduction in the appropriate type.  */
    8426         4108 :               tree rvectype = vectype_out;
    8427         4108 :               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
    8428         4108 :                                               TREE_TYPE (TREE_TYPE (def))))
    8429          235 :                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
    8430              :                                               TYPE_VECTOR_SUBPARTS
    8431          470 :                                                 (vectype_out));
    8432         4108 :               def = vect_create_partial_epilog (def, rvectype,
    8433              :                                                 VECT_REDUC_INFO_CODE
    8434              :                                                   (reduc_info),
    8435              :                                                 &stmts);
    8436              :             }
    8437              :           /* The epilogue loop might use a different vector mode, like
    8438              :              VNx2DI vs. V2DI.  */
    8439         4108 :           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
    8440              :             {
    8441            0 :               tree reduc_type = build_vector_type_for_mode
    8442            0 :                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
    8443            0 :               def = gimple_convert (&stmts, reduc_type, def);
    8444              :             }
    8445              :           /* Adjust the input so we pick up the partially reduced value
    8446              :              for the skip edge in vect_create_epilog_for_reduction.  */
    8447         4108 :           accumulator->reduc_input = def;
    8448              :           /* And the reduction could be carried out using a different sign.  */
    8449         4108 :           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
    8450          235 :             def = gimple_convert (&stmts, vectype_out, def);
    8451         4108 :           edge e;
    8452         4108 :           if ((e = loop_vinfo->main_loop_edge)
    8453         4108 :               || (e = loop_vinfo->skip_this_loop_edge))
    8454              :             {
    8455              :               /* While we'd like to insert on the edge this will split
    8456              :                  blocks and disturb bookkeeping, we also will eventually
    8457              :                  need this on the skip edge.  Rely on sinking to
    8458              :                  fixup optimal placement and insert in the pred.  */
    8459         3885 :               gimple_stmt_iterator gsi = gsi_last_bb (e->src);
    8460              :               /* Insert before a cond that eventually skips the
    8461              :                  epilogue.  */
    8462         3885 :               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
    8463         3868 :                 gsi_prev (&gsi);
    8464         3885 :               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
    8465              :             }
    8466              :           else
    8467          223 :             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
    8468              :                                               stmts);
    8469              :         }
    8470         4111 :       if (loop_vinfo->main_loop_edge)
    8471         3888 :         vec_initial_defs[0]
    8472         3888 :           = vect_get_main_loop_result (loop_vinfo, def,
    8473         3888 :                                        vec_initial_defs[0]);
    8474              :       else
    8475          223 :         vec_initial_defs.safe_push (def);
    8476              :     }
    8477              : 
    8478              :   /* Generate the reduction PHIs upfront.  */
    8479        47553 :   for (i = 0; i < vec_num; i++)
    8480              :     {
    8481        24721 :       tree vec_init_def = vec_initial_defs[i];
    8482              :       /* Create the reduction-phi that defines the reduction
    8483              :          operand.  */
    8484        24721 :       gphi *new_phi = create_phi_node (vec_dest, loop->header);
    8485        24721 :       add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
    8486              :                    UNKNOWN_LOCATION);
    8487              : 
    8488              :       /* The loop-latch arg is set in epilogue processing.  */
    8489              : 
    8490        24721 :       slp_node->push_vec_def (new_phi);
    8491              :     }
    8492              : 
    8493        22832 :   return true;
    8494        22832 : }
    8495              : 
    8496              : /* Analysis for vectorizing loop-closed (LC) PHIs.  Return true and tag
    8497              :    SLP_NODE as lc_phi_info_type if STMT_INFO is a supported one-arg PHI.  */
    8498              : bool
    8499       181833 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
    8500              :                      stmt_vec_info stmt_info,
    8501              :                      slp_tree slp_node)
    8502              : {
    8503       181833 :   if (!loop_vinfo
    8504       181833 :       || !is_a <gphi *> (stmt_info->stmt)
    8505       217674 :       || gimple_phi_num_args (stmt_info->stmt) != 1)
    8506              :     return false;
    8507              :   /* Only internal defs and double-reduction defs are handled here.  */
    8508          821 :   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
    8509            0 :       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    8510              :     return false;
    8511              : 
    8512              :   /* Deal with copies from externs or constants that disguise as
    8513              :      loop-closed PHI nodes (PR97886).  */
    8514          821 :   if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
    8515              :                                          SLP_TREE_VECTYPE (slp_node)))
    8516              :     {
    8517            0 :       if (dump_enabled_p ())
    8518            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8519              :                          "incompatible vector types for invariants\n");
    8520            0 :       return false;
    8521              :     }
    8522              : 
    8523              :   /* ???  This can happen with data vs. mask uses of boolean.  */
    8524          821 :   if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
    8525          821 :                                   SLP_TREE_VECTYPE
    8526              :                                     (SLP_TREE_CHILDREN (slp_node)[0])))
    8527              :     {
    8528            0 :       if (dump_enabled_p ())
    8529            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8530              :                          "missed mask promotion\n");
    8531            0 :       return false;
    8532              :     }
    8533              : 
    8534          821 :   SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
    8535          821 :   return true;
    8536              : }
    8537              : /* Transform an LC PHI: create one vector PHI per vector def of its argument.  */
    8538              : bool
    8539          530 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
    8540              :                        stmt_vec_info stmt_info,
    8541              :                        slp_tree slp_node)
    8542              : {
    8543              :   /* The new PHIs go into the scalar PHI's block BB, fed from its single pred edge E.  */
    8544          530 :   tree vectype = SLP_TREE_VECTYPE (slp_node);
    8545          530 :   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
    8546          530 :   basic_block bb = gimple_bb (stmt_info->stmt);
    8547          530 :   edge e = single_pred_edge (bb);
    8548          530 :   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
    8549          530 :   auto_vec<tree> vec_oprnds;
    8550         1060 :   vect_get_vec_defs (loop_vinfo, slp_node,
    8551          530 :                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
    8552         1175 :   for (unsigned i = 0; i < vec_oprnds.length (); i++)
    8553              :     {
    8554              :       /* Create the vectorized LC PHI node.  */
    8555          645 :       gphi *new_phi = create_phi_node (vec_dest, bb);
    8556          645 :       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
    8557          645 :       slp_node->push_vec_def (new_phi);
    8558              :     }
    8559              : 
    8560          530 :   return true;
    8561          530 : }
    8562              : 
    8563              : /* Vectorizes PHIs for BB vectorization.  If COST_VEC is non-null only
    8564              :    analyze SLP_NODE and record its cost, otherwise emit the vector PHIs.  */
    8565              : bool
    8566       138388 : vectorizable_phi (bb_vec_info vinfo,
    8567              :                   stmt_vec_info stmt_info,
    8568              :                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
    8569              : {
    8570       138388 :   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    8571              :     return false;
    8572              : 
    8573        72105 :   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    8574              :     return false;
    8575              : 
    8576        72105 :   tree vectype = SLP_TREE_VECTYPE (slp_node);
    8577              : 
    8578        72105 :   if (cost_vec) /* transformation not required.  */
    8579              :     {
    8580              :       slp_tree child;
    8581              :       unsigned i;
    8582       197654 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
    8583       139731 :         if (!child)
    8584              :           {
    8585            0 :             if (dump_enabled_p ())
    8586            0 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8587              :                                "PHI node with unvectorized backedge def\n");
    8588            0 :             return false;
    8589              :           }
    8590       139731 :         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
    8591              :           {
    8592           18 :             if (dump_enabled_p ())
    8593            2 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8594              :                                "incompatible vector types for invariants\n");
    8595           18 :             return false;
    8596              :           }
    8597       139713 :         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
    8598       139713 :                  && !useless_type_conversion_p (vectype,
    8599              :                                                 SLP_TREE_VECTYPE (child)))
    8600              :           {
    8601              :             /* With bools we can have mask and non-mask precision vectors
    8602              :                or different non-mask precisions.  While pattern recog is
    8603              :                supposed to guarantee consistency here, bugs in it can cause
    8604              :                mismatches (PR103489 and PR103800 for example).
    8605              :                Deal with them here instead of ICEing later.  */
    8606           18 :             if (dump_enabled_p ())
    8607            8 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8608              :                                "incompatible vector type setup from "
    8609              :                                "bool pattern detection\n");
    8610           18 :             return false;
    8611              :           }
    8612              : 
    8613              :       /* For single-argument PHIs assume coalescing which means zero cost
    8614              :          for the scalar and the vector PHIs.  This avoids artificially
    8615              :          favoring the vector path (but may pessimize it in some cases).  */
    8616        57923 :       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
    8617        52469 :         record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
    8618              :                           vector_stmt, slp_node, vectype, 0, vect_body);
    8619        57923 :       SLP_TREE_TYPE (slp_node) = phi_info_type;
    8620        57923 :       return true;
    8621              :     }
    8622              : 
    8623        14146 :   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
    8624        14146 :   basic_block bb = gimple_bb (stmt_info->stmt);
    8625        14146 :   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
    8626        14146 :   auto_vec<gphi *> new_phis;
    8627        51634 :   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
    8628              :     {
    8629        37488 :       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
    8630              : 
    8631              :       /* Skip not yet vectorized defs.  */
    8632        37935 :       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
    8633        37488 :           && SLP_TREE_VEC_DEFS (child).is_empty ())
    8634          447 :         continue;
    8635              : 
    8636        37041 :       auto_vec<tree> vec_oprnds;
    8637        37041 :       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
    8638        37041 :       if (!new_phis.exists ())
    8639              :         {
    8640        14146 :           new_phis.create (vec_oprnds.length ());
    8641        29933 :           for (unsigned j = 0; j < vec_oprnds.length (); j++)
    8642              :             {
    8643              :               /* Create the vectorized PHI node.  */
    8644        15787 :               new_phis.quick_push (create_phi_node (vec_dest, bb));
    8645        15787 :               slp_node->push_vec_def (new_phis[j]);
    8646              :             }
    8647              :         }
    8648        37041 :       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
    8649        80827 :       for (unsigned j = 0; j < vec_oprnds.length (); j++)
    8650        43786 :         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
    8651        37041 :     }
    8652              :   /* We should have at least one already vectorized child.  */
    8653        14146 :   gcc_assert (new_phis.exists ());
    8654              : 
    8655        14146 :   return true;
    8656        14146 : }
    8657              : 
    8658              : /* Vectorizes first order recurrences.  An overview of the transformation
    8659              :    is described below. Suppose we have the following loop.
    8660              : 
    8661              :      int t = 0;
    8662              :      for (int i = 0; i < n; ++i)
    8663              :        {
    8664              :          b[i] = a[i] - t;
    8665              :          t = a[i];
    8666              :        }
    8667              : 
    8668              :    There is a first-order recurrence on 'a'. For this loop, the scalar IR
    8669              :    looks (simplified) like:
    8670              : 
    8671              :     scalar.preheader:
    8672              :       init = 0;
    8673              : 
    8674              :     scalar.body:
    8675              :       i = PHI <0(scalar.preheader), i+1(scalar.body)>
    8676              :       _2 = PHI <init(scalar.preheader), _1(scalar.body)>
    8677              :       _1 = a[i]
    8678              :       b[i] = _1 - _2
    8679              :       if (i < n) goto scalar.body
    8680              : 
    8681              :    In this example, _2 is a recurrence because its value depends on the
    8682              :    previous iteration.  We vectorize this as (VF = 4)
    8683              : 
    8684              :     vector.preheader:
    8685              :       vect_init = vect_cst(..., ..., ..., 0)
    8686              : 
    8687              :     vector.body
    8688              :       i = PHI <0(vector.preheader), i+4(vector.body)>
    8689              :       vect_1 = PHI <vect_init(vector.preheader), vect_2(vector.body)>
    8690              :       vect_2 = a[i, i+1, i+2, i+3];
    8691              :       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
    8692              :       b[i, i+1, i+2, i+3] = vect_2 - vect_3
    8693              :       if (..) goto vector.body
    8694              : 
    8695              :    In this function, vectorizable_recurr, we code generate both the
    8696              :    vector PHI node and the permute since those together compute the
    8697              :    vectorized value of the scalar PHI.  We do not yet have the
    8698              :    backedge value to fill in there nor into the vec_perm.  Those
    8699              :    are filled in vect_schedule_scc.
    8700              : 
    8701              :    TODO:  Since the scalar loop does not have a use of the recurrence
    8702              :    outside of the loop the natural way to implement peeling via
    8703              :    vectorizing the live value doesn't work.  For now peeling of loops
    8704              :    with a recurrence is not implemented.  For SLP the supported cases
    8705              :    are restricted to those requiring a single vector recurrence PHI.  */
    8706              : 
    8707              : bool
    8708       181057 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
    8709              :                      slp_tree slp_node, stmt_vector_for_cost *cost_vec)
    8710              : {
    8711       181057 :   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
    8712              :     return false;
    8713              : 
    8714        35065 :   gphi *phi = as_a<gphi *> (stmt_info->stmt);
    8715              : 
    8716              :   /* So far we only support first-order recurrence auto-vectorization.  */
    8717        35065 :   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
    8718              :     return false;
    8719              :   /* DIST is the number of SLP lanes; the permute below starts at nunits - DIST.  */
    8720          418 :   tree vectype = SLP_TREE_VECTYPE (slp_node);
    8721          418 :   unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
    8722          418 :   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
    8723          418 :   unsigned dist = SLP_TREE_LANES (slp_node);
    8724              :   /* We need to be able to make progress with a single vector.  */
    8725          418 :   if (maybe_gt (dist * 2, nunits))
    8726              :     {
    8727            0 :       if (dump_enabled_p ())
    8728            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8729              :                          "first order recurrence exceeds half of "
    8730              :                          "a vector\n");
    8731            0 :       return false;
    8732              :     }
    8733              : 
    8734              :   /* We need to be able to build a { ..., a, b } init vector with
    8735              :      dist number of distinct trailing values.  Always possible
    8736              :      when dist == 1 or when nunits is constant or when the initializations
    8737              :      are uniform.  */
    8738          418 :   tree uniform_initval = NULL_TREE;
    8739          418 :   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
    8740         1696 :   for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
    8741              :     {
    8742          454 :       gphi *phi = as_a <gphi *> (s->stmt);
    8743          454 :       if (! uniform_initval)
    8744          418 :         uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
    8745           36 :       else if (! operand_equal_p (uniform_initval,
    8746           36 :                                   PHI_ARG_DEF_FROM_EDGE (phi, pe)))
    8747              :         {
    8748              :           uniform_initval = NULL_TREE;
    8749              :           break;
    8750              :         }
    8751              :     }
    8752          418 :   if (!uniform_initval && !nunits.is_constant ())
    8753              :     {
    8754              :       if (dump_enabled_p ())
    8755              :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8756              :                          "cannot build initialization vector for "
    8757              :                          "first order recurrence\n");
    8758              :       return false;
    8759              :     }
    8760              : 
    8761              :   /* First-order recurrence autovectorization needs to handle permutation
    8762              :      with indices = [nunits-dist, nunits-dist+1, nunits-dist+2, ...].  */
    8763          418 :   vec_perm_builder sel (nunits, 1, 3);
    8764         1672 :   for (int i = 0; i < 3; ++i)
    8765         1254 :     sel.quick_push (nunits - dist + i);
    8766          418 :   vec_perm_indices indices (sel, 2, nunits);
    8767              : 
    8768          418 :   if (cost_vec) /* transformation not required.  */
    8769              :     {
    8770          373 :       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
    8771              :                                  indices))
    8772              :         return false;
    8773              : 
    8774              :       /* We eventually need to set a vector type on invariant
    8775              :          arguments.  */
    8776              :       unsigned j;
    8777              :       slp_tree child;
    8778          783 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
    8779          522 :         if (!vect_maybe_update_slp_op_vectype (child, vectype))
    8780              :           {
    8781            0 :             if (dump_enabled_p ())
    8782            0 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    8783              :                                "incompatible vector types for "
    8784              :                                "invariants\n");
    8785            0 :             return false;
    8786              :           }
    8787              : 
    8788              :       /* Verify we have set up compatible types.  */
    8789          261 :       edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
    8790          261 :       slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
    8791          261 :       tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
    8792          261 :       if (!types_compatible_p (latch_vectype, vectype))
    8793              :         return false;
    8794              : 
    8795              :       /* The recurrence costs the initialization vector and one permute
    8796              :          for each copy.  With SLP the prologue value is explicitly
    8797              :          represented and costed separately.  */
    8798          261 :       unsigned prologue_cost = 0;
    8799          261 :       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
    8800              :                                                slp_node, 0, vect_body);
    8801          261 :       if (dump_enabled_p ())
    8802           53 :         dump_printf_loc (MSG_NOTE, vect_location,
    8803              :                          "vectorizable_recurr: inside_cost = %d, "
    8804              :                          "prologue_cost = %d .\n", inside_cost,
    8805              :                          prologue_cost);
    8806              : 
    8807          261 :       SLP_TREE_TYPE (slp_node) = recurr_info_type;
    8808          261 :       return true;
    8809              :     }
    8810              :   /* Transform: build the initial vector, the vector PHI and the permutes.  */
    8811           45 :   tree vec_init;
    8812           45 :   if (! uniform_initval)
    8813              :     {
    8814            6 :       vec<constructor_elt, va_gc> *v = NULL;
    8815            6 :       vec_alloc (v, nunits.to_constant ());
    8816           33 :       for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
    8817           27 :         CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
    8818              :                                 build_zero_cst (TREE_TYPE (vectype)));
    8819           39 :       for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
    8820              :         {
    8821           21 :           gphi *phi = as_a <gphi *> (s->stmt);
    8822           21 :           tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
    8823           21 :           if (!useless_type_conversion_p (TREE_TYPE (vectype),
    8824           21 :                                           TREE_TYPE (preheader)))
    8825              :             {
    8826            0 :               gimple_seq stmts = NULL;
    8827            0 :               preheader = gimple_convert (&stmts,
    8828            0 :                                           TREE_TYPE (vectype), preheader);
    8829            0 :               gsi_insert_seq_on_edge_immediate (pe, stmts);
    8830              :             }
    8831           21 :           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
    8832              :         }
    8833            6 :       vec_init = build_constructor (vectype, v);
    8834              :     }
    8835              :   else
    8836              :     vec_init = uniform_initval;
    8837           45 :   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
    8838              : 
    8839              :   /* Create the vectorized first-order PHI node.  */
    8840           45 :   tree vec_dest = vect_get_new_vect_var (vectype,
    8841              :                                          vect_simple_var, "vec_recur_");
    8842           45 :   basic_block bb = gimple_bb (phi);
    8843           45 :   gphi *new_phi = create_phi_node (vec_dest, bb);
    8844           45 :   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
    8845              : 
    8846              :   /* Insert the shuffles for first-order recurrence autovectorization.
    8847              :        result = VEC_PERM <vec_recur, vect_1, index[nunits-dist, ...]>.  */
    8848           45 :   tree perm = vect_gen_perm_mask_checked (vectype, indices);
    8849              : 
    8850              :   /* Insert the required permute after the latch definition.  The
    8851              :      second and later operands are tentative and will be updated when we have
    8852              :      vectorized the latch definition.  */
    8853           45 :   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
    8854           45 :   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
    8855           45 :   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
    8856           53 :   do
    8857              :     {
    8858           53 :       gsi_next (&gsi2);
    8859              :     }
    8860              :   /* Skip inserted vectorized stmts for the latch definition.  We have to
    8861              :      insert after those.  */
    8862           98 :   while (gsi_stmt (gsi2) && gimple_uid (gsi_stmt (gsi2)) == 0);
    8863              : 
    8864          127 :   for (unsigned i = 0; i < ncopies; ++i)
    8865              :     {
    8866           82 :       vec_dest = make_ssa_name (vectype);
    8867           82 :       gassign *vperm
    8868          127 :           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
    8869           45 :                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
    8870              :                                  NULL, perm);
    8871           82 :       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
    8872              : 
    8873           82 :       slp_node->push_vec_def (vperm);
    8874              :     }
    8875              : 
    8876              :   return true;
    8877          418 : }
    8878              : 
    8879              : /* Return true if VECTYPE represents a vector that requires lowering
    8880              :    by the vector lowering pass: its mode is not a vector mode and it is
    8881              :    not a boolean vector type with 1-bit elements.  */
    8882              : bool
    8883       781584 : vect_emulated_vector_p (tree vectype)
    8884              : {
    8885      1563168 :   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
    8886       785673 :           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
    8887         4071 :               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
    8888              : }
    8889              : 
    8890              : /* Return true if we can emulate CODE on an integer mode representation
    8891              :    of a vector.  This is the case for PLUS, MINUS, NEGATE and the bitwise
    8892              :    AND, IOR, XOR and NOT codes only.  */
    8893              : bool
    8894        11763 : vect_can_vectorize_without_simd_p (tree_code code)
    8895              : {
    8896        11763 :   switch (code)
    8897              :     {
    8898              :     case PLUS_EXPR:
    8899              :     case MINUS_EXPR:
    8900              :     case NEGATE_EXPR:
    8901              :     case BIT_AND_EXPR:
    8902              :     case BIT_IOR_EXPR:
    8903              :     case BIT_XOR_EXPR:
    8904              :     case BIT_NOT_EXPR:
    8905              :       return true;
    8906              : 
    8907        11198 :     default:
    8908        11198 :       return false;
    8909              :     }
    8910              : }
    8911              : 
    8912              : /* Likewise, but taking a code_helper.  A CODE that is not a tree code
    8913              :    is never considered emulatable.  */
    8914              : bool
    8915          992 : vect_can_vectorize_without_simd_p (code_helper code)
    8916              : {
    8917          992 :   return (code.is_tree_code ()
    8918          992 :           && vect_can_vectorize_without_simd_p (tree_code (code)));
    8919              : }
    8920              : 
    8921              : /* Create the initial vector for a vectorized nonlinear iv, emitting into STMTS.  */
    8922              : static tree
    8923          916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
    8924              :                                tree step_expr, poly_uint64 nunits,
    8925              :                                tree vectype,
    8926              :                                enum vect_induction_op_type induction_type)
    8927              : {
    8928          916 :   unsigned HOST_WIDE_INT const_nunits;
    8929          916 :   tree vec_shift, vec_init, new_name;
    8930          916 :   unsigned i;
    8931          916 :   tree itype = TREE_TYPE (vectype);
    8932              :   /* Create the initial vector (X = init_expr, S = step_expr):
    8933              :      shr: [X, X>>S, X>>2*S, ...], shl likewise with <<;
    8934              :      neg: [X, -X, X, -X, ...]; mul: [X, X*S, X*S*S, ...].  */
    8935          916 :   new_name = gimple_convert (stmts, itype, init_expr);
    8936          916 :   switch (induction_type)
    8937              :     {
    8938           18 :     case vect_step_op_shr:
    8939           18 :     case vect_step_op_shl:
    8940              :       /* Build the initial value: splat X and shift it by [0, S, 2*S, ...].  */
    8941           18 :       vec_init = gimple_build_vector_from_val (stmts,
    8942              :                                                vectype,
    8943              :                                                new_name);
    8944           18 :       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
    8945              :                                 build_zero_cst (itype), step_expr);
    8946           18 :       vec_init = gimple_build (stmts,
    8947              :                                (induction_type == vect_step_op_shr
    8948              :                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
    8949              :                                vectype, vec_init, vec_shift);
    8950           18 :       break;
    8951              : 
    8952          822 :     case vect_step_op_neg:
    8953          822 :       {
    8954          822 :         vec_init = gimple_build_vector_from_val (stmts,
    8955              :                                                  vectype,
    8956              :                                                  new_name);
    8957          822 :         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
    8958              :                                      vectype, vec_init);
    8959              :         /* The encoding has 2 interleaved stepped patterns.  */
    8960          822 :         vec_perm_builder sel (nunits, 2, 3);
    8961          822 :         sel.quick_grow (6);
    8962         4110 :         for (i = 0; i < 3; i++)
    8963              :           {
    8964         2466 :             sel[2 * i] = i;
    8965         2466 :             sel[2 * i + 1] = i + nunits;
    8966              :           }
    8967          822 :         vec_perm_indices indices (sel, 2, nunits);
    8968              :         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
    8969              :            fail when vec_init is const vector. In that situation vec_perm is not
    8970              :            really needed.  */
    8971          822 :         tree perm_mask_even
    8972          822 :           = vect_gen_perm_mask_any (vectype, indices);
    8973          822 :         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
    8974              :                                  vectype,
    8975              :                                  vec_init, vec_neg,
    8976              :                                  perm_mask_even);
    8977          822 :       }
    8978          822 :       break;
    8979              : 
    8980           76 :     case vect_step_op_mul:
    8981           76 :       {
    8982              :         /* Use unsigned mult to avoid undefined integer overflow.  */
    8983           76 :         gcc_assert (nunits.is_constant (&const_nunits));
    8984           76 :         tree utype = unsigned_type_for (itype);
    8985           76 :         tree uvectype = build_vector_type (utype,
    8986           76 :                                            TYPE_VECTOR_SUBPARTS (vectype));
    8987           76 :         new_name = gimple_convert (stmts, utype, new_name);
    8988           76 :         vec_init = gimple_build_vector_from_val (stmts,
    8989              :                                                  uvectype,
    8990              :                                                  new_name);
    8991           76 :         tree_vector_builder elts (uvectype, const_nunits, 1);
    8992           76 :         tree elt_step = build_one_cst (utype);
    8993              : 
    8994           76 :         elts.quick_push (elt_step);
    8995          660 :         for (i = 1; i < const_nunits; i++)
    8996              :           {
    8997              :             /* Create: elt_step_i = elt_step_(i-1) * step_expr.  */
    8998          508 :             elt_step = gimple_build (stmts, MULT_EXPR,
    8999              :                                      utype, elt_step, step_expr);
    9000          508 :             elts.quick_push (elt_step);
    9001              :           }
    9002              :         /* Create a vector from [elt_step_0, elt_step_1, ...,
    9003              :            elt_step_nunits-1].  */
    9004           76 :         tree vec_mul = gimple_build_vector (stmts, &elts);
    9005           76 :         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
    9006              :                                  vec_init, vec_mul);
    9007           76 :         vec_init = gimple_convert (stmts, vectype, vec_init);
    9008           76 :       }
    9009           76 :       break;
    9010              : 
    9011            0 :     default:
    9012            0 :       gcc_unreachable ();
    9013              :     }
    9014              : 
    9015          916 :   return vec_init;
    9016              : }
    9017              : 
    9018              : /* Peel INIT_EXPR by SKIP_NITERS iterations for INDUCTION_TYPE, appending any needed statements to STMTS; return the adjusted initial value.  */
    9019              : tree
    9020           84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
    9021              :                              tree skip_niters, tree step_expr,
    9022              :                              enum vect_induction_op_type induction_type,
    9023              :                              bool early_exit_p)
    9024              : {
    9025           84 :   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
    9026           84 :   tree type = TREE_TYPE (init_expr);
    9027           84 :   unsigned prec = TYPE_PRECISION (type);
    9028           84 :   switch (induction_type)
    9029              :     {
    9030              :     /* neg inductions are typically not used for loop termination conditions but
    9031              :        are typically implemented as b = -b.  That is, every scalar iteration b is
    9032              :        negated.  That means that for the initial value of b we will have to
    9033              :        determine whether the number of skipped iterations is a multiple of 2
    9034              :        because every 2 scalar iterations we are back at "b".  */
    9035            0 :     case vect_step_op_neg:
    9036              :       /* For early exits the neg induction will always be the same value at the
    9037              :          start of the iteration.  */
    9038            0 :       if (early_exit_p)
    9039              :         break;
    9040              : 
    9041            0 :       if (TREE_INT_CST_LOW (skip_niters) % 2)
    9042            0 :         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
    9043              :       /* else no change.  */
    9044              :       break;
    9045              : 
    9046           12 :     case vect_step_op_shr:
    9047           12 :     case vect_step_op_shl:
    9048           12 :       skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
    9049           12 :       step_expr = fold_build1 (NOP_EXPR, type, step_expr);
    9050           12 :       step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
    9051              :       /* When the total shift amount >= precision, we need to avoid UB.
    9052              :          In the original loop there is no UB, and semantically
    9053              :          init_expr should be 0 for lshr and shl, and init_expr >> (prec - 1) for ashr.  */
    9054           12 :       if ((!tree_fits_uhwi_p (step_expr)
    9055           12 :           || tree_to_uhwi (step_expr) >= prec)
    9056            6 :           && !early_exit_p)
    9057              :         {
    9058            6 :           if (induction_type == vect_step_op_shl
    9059            6 :               || TYPE_UNSIGNED (type))
    9060            4 :             init_expr = build_zero_cst (type);
    9061              :           else
    9062            2 :             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
    9063              :                                       init_expr,
    9064            4 :                                       wide_int_to_tree (type, prec - 1));
    9065              :         }
    9066              :       else
    9067              :         {
    9068            8 :           init_expr = fold_build2 ((induction_type == vect_step_op_shr
    9069              :                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
    9070              :                                     type, init_expr, step_expr);
    9071            6 :           init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
    9072              :         }
    9073              :       break;
    9074              : 
    9075           72 :     case vect_step_op_mul:
    9076           72 :       {
    9077              :         /* Due to UB we can't support vect_step_op_mul with early break for now,
    9078              :            so assert that SKIP_NITERS is constant.  */
    9079           72 :         gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
    9080           72 :         tree utype = unsigned_type_for (type);
    9081           72 :         init_expr = gimple_convert (stmts, utype, init_expr);
    9082           72 :         wide_int skipn = wi::to_wide (skip_niters);
    9083           72 :         wide_int begin = wi::to_wide (step_expr);
    9084           72 :         auto_mpz base, exp, mod, res;
    9085           72 :         wi::to_mpz (begin, base, TYPE_SIGN (type));
    9086           72 :         wi::to_mpz (skipn, exp, UNSIGNED);
    9087           72 :         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
    9088           72 :         mpz_powm (res, base, exp, mod);
    9089           72 :         begin = wi::from_mpz (utype, res, true);
    9090           72 :         tree mult_expr = wide_int_to_tree (utype, begin);
    9091           72 :         init_expr = gimple_build (stmts, MULT_EXPR, utype,
    9092              :                                   init_expr, mult_expr);
    9093           72 :         init_expr = gimple_convert (stmts, type, init_expr);
    9094           72 :       }
    9095           72 :       break;
    9096              : 
    9097            0 :     default:
    9098            0 :       gcc_unreachable ();
    9099              :     }
    9100              : 
    9101           84 :   return init_expr;
    9102              : }
    9103              : 
    9104              : /* Create the scalar step by which the vectorized iv advances over VF scalar iterations: pow (STEP_EXPR, VF) for mul, NULL for neg, otherwise VF * STEP_EXPR (built into STMTS).  */
    9105              : static tree
    9106         1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
    9107              :                                poly_uint64 vf,
    9108              :                                enum vect_induction_op_type induction_type)
    9109              : {
    9110         1202 :   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
    9111         1202 :   tree new_name = NULL;
    9112              :   /* Step should be pow (step, vf) for mult induction.  */
    9113         1202 :   if (induction_type == vect_step_op_mul)
    9114              :     {
    9115           76 :       gcc_assert (vf.is_constant ());
    9116           76 :       wide_int begin = wi::to_wide (step_expr);
    9117              : 
    9118          584 :       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
    9119          508 :         begin = wi::mul (begin, wi::to_wide (step_expr));
    9120              : 
    9121           76 :       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
    9122           76 :     }
    9123         1126 :   else if (induction_type == vect_step_op_neg)
    9124              :     /* No step is needed for neg induction; NEW_NAME stays NULL.  */
    9125              :     ;
    9126              :   else
    9127           18 :     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
    9128              :                              expr, step_expr);
    9129         1202 :   return new_name;
    9130              : }
    9131              : /* Broadcast the scalar step NEW_NAME into a vector of VECTYPE via vect_init_vector and return it; return NULL for neg induction, which needs no step.  */
    9132              : static tree
    9133         1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
    9134              :                                    stmt_vec_info stmt_info,
    9135              :                                    tree new_name, tree vectype,
    9136              :                                    enum vect_induction_op_type induction_type)
    9137              : {
    9138              :   /* No step is needed for neg induction.  */
    9139         1202 :   if (induction_type == vect_step_op_neg)
    9140              :     return NULL;
    9141              : 
    9142           94 :   tree t = unshare_expr (new_name);
    9143           94 :   gcc_assert (CONSTANT_CLASS_P (new_name)
    9144              :               || TREE_CODE (new_name) == SSA_NAME);
    9145           94 :   tree new_vec = build_vector_from_val (vectype, t);
    9146           94 :   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
    9147              :                                     new_vec, vectype, NULL);
    9148           94 :   return vec_step;
    9149              : }
    9150              : 
    9151              : /* Advance the vectorized iv INDUC_DEF by VEC_STEP according to INDUCTION_TYPE, appending statements to STMTS; return the updated vector def.  */
    9152              : static tree
    9153         1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
    9154              :                           tree induc_def, tree vec_step,
    9155              :                           enum vect_induction_op_type induction_type)
    9156              : {
    9157         1390 :   tree vec_def = induc_def;
    9158         1390 :   switch (induction_type)
    9159              :     {
    9160           76 :     case vect_step_op_mul:
    9161           76 :       {
    9162              :         /* Use unsigned mult to avoid UB from signed integer overflow.  */
    9163           76 :         tree uvectype = unsigned_type_for (vectype);
    9164           76 :         vec_def = gimple_convert (stmts, uvectype, vec_def);
    9165           76 :         vec_step = gimple_convert (stmts, uvectype, vec_step);
    9166           76 :         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
    9167              :                                 vec_def, vec_step);
    9168           76 :         vec_def = gimple_convert (stmts, vectype, vec_def);
    9169              :       }
    9170           76 :       break;
    9171              : 
    9172           12 :     case vect_step_op_shr:
    9173           12 :       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
    9174              :                               vec_def, vec_step);
    9175           12 :       break;
    9176              : 
    9177            6 :     case vect_step_op_shl:
    9178            6 :       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
    9179              :                               vec_def, vec_step);
    9180            6 :       break;
    9181              :     case vect_step_op_neg:
    9182              :       vec_def = induc_def;
    9183              :       /* Neg induction has no update: the def is returned unchanged.  */
    9184              :       break;
    9185            0 :     default:
    9186            0 :       gcc_unreachable ();
    9187              :     }
    9188              : 
    9189         1390 :   return vec_def;
    9190              : 
    9191              : }
    9192              : 
    9193              : /* Function vectorizable_nonlinear_induction
    9194              : 
    9195              :    Check if STMT_INFO performs a nonlinear induction computation that can be
    9196              :    vectorized.  If COST_VEC is nonnull, only record the costs there.  Otherwise
    9197              :    vectorize the induction PHI: create a vectorized phi in the loop header and
    9198              :    push it (and any further copies) as vector defs of SLP_NODE.
    9199              :    Return true if STMT_INFO is vectorizable in this way.  */
    9200              : 
    9201              : static bool
    9202         9198 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
    9203              :                                   stmt_vec_info stmt_info,
    9204              :                                   slp_tree slp_node,
    9205              :                                   stmt_vector_for_cost *cost_vec)
    9206              : {
    9207         9198 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    9208         9198 :   unsigned ncopies;
    9209         9198 :   bool nested_in_vect_loop = false;
    9210         9198 :   class loop *iv_loop;
    9211         9198 :   tree vec_def;
    9212         9198 :   edge pe = loop_preheader_edge (loop);
    9213         9198 :   basic_block new_bb;
    9214         9198 :   tree vec_init, vec_step;
    9215         9198 :   tree new_name;
    9216         9198 :   gimple *new_stmt;
    9217         9198 :   gphi *induction_phi;
    9218         9198 :   tree induc_def, vec_dest;
    9219         9198 :   tree init_expr, step_expr;
    9220         9198 :   tree niters_skip;
    9221         9198 :   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    9222         9198 :   unsigned i;
    9223         9198 :   gimple_stmt_iterator si;
    9224              : 
    9225         9198 :   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
    9226              : 
    9227         9198 :   tree vectype = SLP_TREE_VECTYPE (slp_node);
    9228         9198 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
    9229         9198 :   enum vect_induction_op_type induction_type
    9230              :     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
    9231              : 
    9232         9198 :   gcc_assert (induction_type > vect_step_op_add);
    9233              : 
    9234         9198 :   ncopies = vect_get_num_copies (loop_vinfo, slp_node);
    9235         9198 :   gcc_assert (ncopies >= 1);
    9236              : 
    9237              :   /* FORNOW.  Only handle nonlinear induction in the same loop.  */
    9238         9198 :   if (nested_in_vect_loop_p (loop, stmt_info))
    9239              :     {
    9240            0 :       if (dump_enabled_p ())
    9241            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9242              :                          "nonlinear induction in nested loop.\n");
    9243            0 :       return false;
    9244              :     }
    9245              : 
    9246         9198 :   iv_loop = loop;
    9247         9198 :   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
    9248              : 
    9249              :   /* TODO: Support multi-lane SLP for nonlinear iv.  There should be a separate
    9250              :      vector iv update for each iv and a permutation to generate the wanted
    9251              :      vector iv.  */
    9252         9198 :   if (SLP_TREE_LANES (slp_node) > 1)
    9253              :     {
    9254            0 :       if (dump_enabled_p ())
    9255            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9256              :                          "SLP induction not supported for nonlinear"
    9257              :                          " induction.\n");
    9258            0 :       return false;
    9259              :     }
    9260              : 
    9261         9198 :   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
    9262              :     {
    9263            0 :       if (dump_enabled_p ())
    9264            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9265              :                          "floating point nonlinear induction vectorization"
    9266              :                          " not supported.\n");
    9267            0 :       return false;
    9268              :     }
    9269              : 
    9270         9198 :   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
    9271         9198 :   init_expr = vect_phi_initial_value (phi);
    9272         9198 :   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
    9273              :               && TREE_CODE (step_expr) == INTEGER_CST);
    9274              :   /* step_expr should be aligned with init_expr,
    9275              :      i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used.  */
    9276         9198 :   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
    9277              : 
    9278         9198 :   if (TREE_CODE (init_expr) == INTEGER_CST)
    9279         4085 :     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
    9280         5113 :   else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
    9281              :     {
    9282              :       /* INIT_EXPR could be a bit_field; bail out in that case.  */
    9283            4 :       if (dump_enabled_p ())
    9284            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9285              :                          "nonlinear induction vectorization failed:"
    9286              :                          " component type of vectype is not a nop conversion"
    9287              :                          " from type of init_expr.\n");
    9288            4 :       return false;
    9289              :     }
    9290              : 
    9291         9194 :   switch (induction_type)
    9292              :     {
    9293         3714 :     case vect_step_op_neg:
    9294         3714 :       if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
    9295              :         return false;
    9296         3552 :       if (TREE_CODE (init_expr) != INTEGER_CST
    9297          282 :           && TREE_CODE (init_expr) != REAL_CST)
    9298              :         {
    9299              :           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
    9300          282 :           if (!directly_supported_p (NEGATE_EXPR, vectype))
    9301            0 :             return false;
    9302              : 
    9303              :           /* The encoding has 2 interleaved stepped patterns.  */
    9304          282 :           vec_perm_builder sel (nunits, 2, 3);
    9305          282 :           machine_mode mode = TYPE_MODE (vectype);
    9306          282 :           sel.quick_grow (6);
    9307         1410 :           for (i = 0; i < 3; i++)
    9308              :             {
    9309          846 :               sel[i * 2] = i;
    9310          846 :               sel[i * 2 + 1] = i + nunits;
    9311              :             }
    9312          282 :           vec_perm_indices indices (sel, 2, nunits);
    9313          282 :           if (!can_vec_perm_const_p (mode, mode, indices))
    9314            0 :             return false;
    9315          282 :         }
    9316              :       break;
    9317              : 
    9318         1058 :     case vect_step_op_mul:
    9319         1058 :       {
    9320              :         /* Check for backend support of MULT_EXPR.  */
    9321         1058 :         if (!directly_supported_p (MULT_EXPR, vectype))
    9322              :           return false;
    9323              : 
    9324              :         /* ?? How to construct vector step for a variable-length vector.
    9325              :            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
    9326              :         if (!vf.is_constant ())
    9327              :           return false;
    9328              :       }
    9329              :       break;
    9330              : 
    9331         4104 :     case vect_step_op_shr:
    9332              :       /* Check for backend support of RSHIFT_EXPR.  */
    9333         4104 :       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
    9334              :         return false;
    9335              : 
    9336              :       /* Don't shift by the type precision or more, to avoid UB.  */
    9337           26 :       if (!tree_fits_uhwi_p (step_expr)
    9338           26 :           || maybe_ge (nunits * tree_to_uhwi (step_expr),
    9339              :                        TYPE_PRECISION (TREE_TYPE (init_expr))))
    9340              :         return false;
    9341              :       break;
    9342              : 
    9343          318 :     case vect_step_op_shl:
    9344              :       /* Check for backend support of LSHIFT_EXPR.  */
    9345          318 :       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
    9346              :         return false;
    9347              : 
    9348              :       /* Don't shift by the type precision or more, to avoid UB.  */
    9349           12 :       if (!tree_fits_uhwi_p (step_expr)
    9350           12 :           || maybe_ge (nunits * tree_to_uhwi (step_expr),
    9351              :                        TYPE_PRECISION (TREE_TYPE (init_expr))))
    9352              :         return false;
    9353              : 
    9354              :       break;
    9355              : 
    9356            0 :     default:
    9357            0 :       gcc_unreachable ();
    9358              :     }
    9359              : 
    9360         4412 :   if (cost_vec) /* transformation not required.  */
    9361              :     {
    9362         3496 :       unsigned inside_cost = 0, prologue_cost = 0;
    9363              :       /* Loop cost for vec_loop: NCOPIES vector stmts recorded in the
    9364              :          loop body.  */
    9365         3496 :       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
    9366              :                                       slp_node, 0, vect_body);
    9367              : 
    9368              :       /* Neg induction is reported with no inside_cost.  NOTE(review): the
    9369              :          entry was already recorded in COST_VEC; only the dumped value is 0.  */
    9370         3496 :       if (induction_type == vect_step_op_neg)
    9371         2730 :         inside_cost = 0;
    9372              : 
    9373              :       /* prologue cost for vec_init and vec_step.  */
    9374         3496 :       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
    9375              :                                         slp_node, 0, vect_prologue);
    9376              : 
    9377         3496 :       if (dump_enabled_p ())
    9378           68 :         dump_printf_loc (MSG_NOTE, vect_location,
    9379              :                          "vect_model_induction_cost: inside_cost = %d, "
    9380              :                          "prologue_cost = %d. \n", inside_cost,
    9381              :                          prologue_cost);
    9382              : 
    9383         3496 :       SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
    9384         3496 :       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
    9385         3496 :       return true;
    9386              :     }
    9387              : 
    9388              :   /* Transform.  */
    9389              : 
    9390              :   /* Compute a vector variable, initialized with the first NUNITS values of
    9391              :      the induction variable.  E.g., for a mul iv with IV_PHI='X' and
    9392              :      evolution S, for a vector of 4 units, we want to compute:
    9393              :      [X, X * S, X * S^2, X * S^3].  */
    9394              : 
    9395          916 :   if (dump_enabled_p ())
    9396           32 :     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
    9397              : 
    9398          916 :   pe = loop_preheader_edge (iv_loop);
    9399              :   /* Find the first insertion point in the BB.  */
    9400          916 :   basic_block bb = gimple_bb (phi);
    9401          916 :   si = gsi_after_labels (bb);
    9402              : 
    9403          916 :   gimple_seq stmts = NULL;
    9404              : 
    9405          916 :   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
    9406              :   /* If we are using the loop mask to "peel" for alignment then we need
    9407              :      to adjust the start value here.  */
    9408          916 :   if (niters_skip != NULL_TREE)
    9409            0 :     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
    9410              :                                              step_expr, induction_type, false);
    9411              : 
    9412          916 :   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
    9413              :                                             step_expr, nunits, vectype,
    9414              :                                             induction_type);
    9415          916 :   if (stmts)
    9416              :     {
    9417          162 :       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
    9418          162 :       gcc_assert (!new_bb);
    9419              :     }
    9420              : 
    9421          916 :   stmts = NULL;
    9422          916 :   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
    9423              :                                             vf, induction_type);
    9424          916 :   if (stmts)
    9425              :     {
    9426            0 :       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
    9427            0 :       gcc_assert (!new_bb);
    9428              :     }
    9429              : 
    9430          916 :   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
    9431              :                                                 new_name, vectype,
    9432              :                                                 induction_type);
    9433              :   /* Create the following def-use cycle:
    9434              :      loop prolog:
    9435              :      vec_init = ...
    9436              :      vec_step = ...
    9437              :      loop:
    9438              :      vec_iv = PHI <vec_init, vec_loop>
    9439              :      ...
    9440              :      STMT
    9441              :      ...
    9442              :      vec_loop = vec_iv OP vec_step;  (OP is *, >> or <<; no update for neg)  */
    9443              : 
    9444              :   /* Create the induction-phi that defines the induction-operand.  */
    9445          916 :   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
    9446          916 :   induction_phi = create_phi_node (vec_dest, iv_loop->header);
    9447          916 :   induc_def = PHI_RESULT (induction_phi);
    9448              : 
    9449              :   /* Create the iv update inside the loop.  */
    9450          916 :   stmts = NULL;
    9451          916 :   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
    9452              :                                       induc_def, vec_step,
    9453              :                                       induction_type);
    9454              : 
    9455          916 :   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
    9456          916 :   new_stmt = SSA_NAME_DEF_STMT (vec_def);
    9457              : 
    9458              :   /* Set the arguments of the phi node:  */
    9459          916 :   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
    9460          916 :   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
    9461              :                UNKNOWN_LOCATION);
    9462              : 
    9463          916 :   slp_node->push_vec_def (induction_phi);
    9464              : 
    9465              :   /* In case that vectorization factor (VF) is bigger than the number
    9466              :      of elements that we can fit in a vectype (nunits), we have to generate
    9467              :      more than one vector stmt, i.e. we need to "unroll" the
    9468              :      vector stmt by a factor VF/nunits.  For more details see documentation
    9469              :      in vectorizable_operation.  */
    9470              : 
    9471          916 :   if (ncopies > 1)
    9472              :     {
    9473          286 :       stmts = NULL;
    9474              :       /* FORNOW.  This restriction should be relaxed.  */
    9475          286 :       gcc_assert (!nested_in_vect_loop);
    9476              : 
    9477          286 :       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
    9478              :                                                 nunits, induction_type);
    9479              : 
    9480          286 :       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
    9481              :                                                     new_name, vectype,
    9482              :                                                     induction_type);
    9483          286 :       vec_def = induc_def;
    9484         1046 :       for (i = 1; i < ncopies; i++)
    9485              :         {
    9486              :           /* vec_i = vec_prev OP vec_step, with the step covering NUNITS lanes.  */
    9487          474 :           stmts = NULL;
    9488          474 :           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
    9489              :                                               vec_def, vec_step,
    9490              :                                               induction_type);
    9491          474 :           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
    9492          474 :           new_stmt = SSA_NAME_DEF_STMT (vec_def);
    9493          474 :           slp_node->push_vec_def (new_stmt);
    9494              :         }
    9495              :     }
    9496              : 
    9497          916 :   if (dump_enabled_p ())
    9498           64 :     dump_printf_loc (MSG_NOTE, vect_location,
    9499              :                      "transform induction: created def-use cycle: %G%G",
    9500           32 :                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
    9501              : 
    9502              :   return true;
    9503              : }
    9504              : 
    9505              : /* Function vectorizable_induction
    9506              : 
    9507              :    Check if STMT_INFO performs an induction computation that can be vectorized.
    9508              :    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
    9509              :    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
    9510              :    Return true if STMT_INFO is vectorizable in this way.  */
    9511              : 
    9512              : bool
    9513       316179 : vectorizable_induction (loop_vec_info loop_vinfo,
    9514              :                         stmt_vec_info stmt_info,
    9515              :                         slp_tree slp_node, stmt_vector_for_cost *cost_vec)
    9516              : {
    9517       316179 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    9518       316179 :   bool nested_in_vect_loop = false;
    9519       316179 :   class loop *iv_loop;
    9520       316179 :   tree vec_def;
    9521       316179 :   edge pe = loop_preheader_edge (loop);
    9522       316179 :   basic_block new_bb;
    9523       316179 :   tree vec_init = NULL_TREE, vec_step, t;
    9524       316179 :   tree new_name;
    9525       316179 :   gphi *induction_phi;
    9526       316179 :   tree induc_def, vec_dest;
    9527       316179 :   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    9528       316179 :   unsigned i;
    9529       316179 :   tree expr;
    9530       316179 :   tree index_vectype = NULL_TREE;
    9531       316179 :   gimple_stmt_iterator si;
    9532       316179 :   enum vect_induction_op_type induction_type
    9533              :     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
    9534              : 
    9535       347234 :   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
    9536       170187 :   if (!phi)
    9537              :     return false;
    9538              : 
    9539       170187 :   if (!STMT_VINFO_RELEVANT_P (stmt_info))
    9540              :     return false;
    9541              : 
    9542              :   /* Make sure it was recognized as induction computation.  */
    9543       170187 :   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
    9544              :     return false;
    9545              : 
    9546              :   /* Handle nonlinear induction in a separate place.  */
    9547       166130 :   if (induction_type != vect_step_op_add)
    9548         9198 :     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
    9549         9198 :                                              slp_node, cost_vec);
    9550              : 
    9551       156932 :   tree vectype = SLP_TREE_VECTYPE (slp_node);
    9552       156932 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
    9553              : 
    9554              :   /* FORNOW. These restrictions should be relaxed.  */
    9555       156932 :   if (nested_in_vect_loop_p (loop, stmt_info))
    9556              :     {
    9557          813 :       imm_use_iterator imm_iter;
    9558          813 :       use_operand_p use_p;
    9559          813 :       gimple *exit_phi;
    9560          813 :       edge latch_e;
    9561          813 :       tree loop_arg;
    9562              : 
    9563          813 :       exit_phi = NULL;
    9564          813 :       latch_e = loop_latch_edge (loop->inner);
    9565          813 :       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
    9566         2475 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
    9567              :         {
    9568          873 :           gimple *use_stmt = USE_STMT (use_p);
    9569          873 :           if (is_gimple_debug (use_stmt))
    9570           36 :             continue;
    9571              : 
    9572          837 :           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
    9573              :             {
    9574              :               exit_phi = use_stmt;
    9575              :               break;
    9576              :             }
    9577          813 :         }
    9578          813 :       if (exit_phi)
    9579              :         {
    9580           24 :           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
    9581           24 :           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
    9582            8 :                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
    9583              :             {
    9584           16 :               if (dump_enabled_p ())
    9585           16 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9586              :                                  "inner-loop induction only used outside "
    9587              :                                  "of the outer vectorized loop.\n");
    9588           16 :               return false;
    9589              :             }
    9590              :         }
    9591              : 
    9592          797 :       nested_in_vect_loop = true;
    9593          797 :       iv_loop = loop->inner;
    9594              :     }
    9595              :   else
    9596              :     iv_loop = loop;
    9597       156916 :   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
    9598              : 
    9599       156916 :   if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
    9600              :     {
    9601              :       /* The current SLP code creates the step value element-by-element.  */
    9602              :       if (dump_enabled_p ())
    9603              :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9604              :                          "SLP induction not supported for variable-length"
    9605              :                          " vectors.\n");
    9606              :       return false;
    9607              :     }
    9608              : 
    9609       156916 :   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
    9610              :     {
    9611           12 :       if (dump_enabled_p ())
    9612           12 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9613              :                          "floating point induction vectorization disabled\n");
    9614           12 :       return false;
    9615              :     }
    9616              : 
    9617       156904 :   tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
    9618       156904 :   gcc_assert (step_expr != NULL_TREE);
    9619       313784 :   if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
    9620       313685 :       && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
    9621              :     {
    9622           12 :       if (dump_enabled_p ())
    9623           12 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9624              :                          "bit-precision induction vectorization not "
    9625              :                          "supported.\n");
    9626           12 :       return false;
    9627              :     }
    9628       156892 :   tree stept = TREE_TYPE (step_expr);
    9629       156892 :   tree step_vectype = get_same_sized_vectype (stept, vectype);
    9630       156892 :   stept = TREE_TYPE (step_vectype);
    9631              : 
    9632              :   /* Check for target support of the vectorized arithmetic used here.  */
    9633       156892 :   if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
    9634       156892 :       || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
    9635        26958 :       return false;
    9636       129934 :   if (!nunits.is_constant ())
    9637              :     {
    9638              :       if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
    9639              :         return false;
    9640              :       /* FLOAT_EXPR when computing VEC_INIT for float inductions.  */
    9641              :       if (SCALAR_FLOAT_TYPE_P (stept))
    9642              :         {
    9643              :           tree index_type = build_nonstandard_integer_type
    9644              :                 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
    9645              : 
    9646              :           index_vectype = build_vector_type (index_type, nunits);
    9647              :           if (!can_float_p (TYPE_MODE (step_vectype),
    9648              :                             TYPE_MODE (index_vectype), 1))
    9649              :             return false;
    9650              :         }
    9651              :     }
    9652              : 
    9653       129934 :   unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
    9654       129934 :   if (cost_vec) /* transformation not required.  */
    9655              :     {
    9656       343617 :       unsigned inside_cost = 0, prologue_cost = 0;
    9657              :       /* We eventually need to set a vector type on invariant
    9658              :          arguments.  */
    9659              :       unsigned j;
    9660              :       slp_tree child;
    9661       343617 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
    9662       229078 :         if (!vect_maybe_update_slp_op_vectype
    9663       229078 :             (child, SLP_TREE_VECTYPE (slp_node)))
    9664              :           {
    9665            0 :             if (dump_enabled_p ())
    9666            0 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9667              :                                "incompatible vector types for "
    9668              :                                "invariants\n");
    9669            0 :             return false;
    9670              :           }
    9671              :       /* loop cost for vec_loop.  */
    9672       114539 :       inside_cost = record_stmt_cost (cost_vec, nvects,
    9673              :                                       vector_stmt, slp_node, 0, vect_body);
    9674              :       /* prologue cost for vec_init (if not nested) and step.  */
    9675       114539 :       prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
    9676              :                                         scalar_to_vec,
    9677              :                                         slp_node, 0, vect_prologue);
    9678       114539 :       if (dump_enabled_p ())
    9679         4088 :         dump_printf_loc (MSG_NOTE, vect_location,
    9680              :                          "vect_model_induction_cost: inside_cost = %d, "
    9681              :                          "prologue_cost = %d .\n", inside_cost,
    9682              :                          prologue_cost);
    9683              : 
    9684       114539 :       SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
    9685       114539 :       DUMP_VECT_SCOPE ("vectorizable_induction");
    9686       114539 :       return true;
    9687              :     }
    9688              : 
    9689              :   /* Transform.  */
    9690              : 
    9691              :   /* Compute a vector variable, initialized with the first VF values of
    9692              :      the induction variable.  E.g., for an iv with IV_PHI='X' and
    9693              :      evolution S, for a vector of 4 units, we want to compute:
    9694              :      [X, X + S, X + 2*S, X + 3*S].  */
    9695              : 
    9696        15395 :   if (dump_enabled_p ())
    9697         2791 :     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
    9698              : 
    9699        15395 :   pe = loop_preheader_edge (iv_loop);
    9700              :   /* Find the first insertion point in the BB.  */
    9701        15395 :   basic_block bb = gimple_bb (phi);
    9702        15395 :   si = gsi_after_labels (bb);
    9703              : 
    9704              :   /* For SLP induction we have to generate several IVs as for example
    9705              :      with group size 3 we need
    9706              :        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
    9707              :        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
    9708        15395 :   gimple_stmt_iterator incr_si;
    9709        15395 :   bool insert_after;
    9710        15395 :   standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
    9711              : 
    9712              :   /* The initial values are vectorized, but any lanes > group_size
    9713              :      need adjustment.  */
    9714        15395 :   slp_tree init_node
    9715        15395 :       = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
    9716              : 
    9717              :   /* Gather steps.  Since we do not vectorize inductions as
    9718              :      cycles we have to reconstruct the step from SCEV data.  */
    9719        15395 :   unsigned group_size = SLP_TREE_LANES (slp_node);
    9720        15395 :   tree *steps = XALLOCAVEC (tree, group_size);
    9721        15395 :   tree *inits = XALLOCAVEC (tree, group_size);
    9722        15395 :   stmt_vec_info phi_info;
    9723        47459 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
    9724              :     {
    9725        16669 :       steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
    9726        16669 :       if (!init_node)
    9727        16424 :         inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
    9728              :                                        pe->dest_idx);
    9729              :     }
    9730              : 
    9731              :   /* Now generate the IVs.  */
    9732        30790 :   gcc_assert (multiple_p (nunits * nvects, group_size));
    9733        15395 :   unsigned nivs;
    9734        15395 :   unsigned HOST_WIDE_INT const_nunits;
    9735        15395 :   if (nested_in_vect_loop)
    9736              :     nivs = nvects;
    9737        15171 :   else if (nunits.is_constant (&const_nunits))
    9738              :     {
    9739              :       /* Compute the number of distinct IVs we need.  First reduce
    9740              :          group_size if it is a multiple of const_nunits so we get
    9741              :          one IV for a group_size of 4 but const_nunits 2.  */
    9742        15171 :       unsigned group_sizep = group_size;
    9743        15171 :       if (group_sizep % const_nunits == 0)
    9744          113 :         group_sizep = group_sizep / const_nunits;
    9745        15171 :       nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
    9746              :     }
    9747              :   else
    9748              :     {
    9749              :       gcc_assert (SLP_TREE_LANES (slp_node) == 1);
    9750              :       nivs = 1;
    9751              :     }
    9752        15395 :   gimple_seq init_stmts = NULL;
    9753        15395 :   tree lupdate_mul = NULL_TREE;
    9754          224 :   if (!nested_in_vect_loop)
    9755              :     {
    9756        15171 :       if (nunits.is_constant (&const_nunits))
    9757              :         {
    9758              :           /* The number of iterations covered in one vector iteration.  */
    9759        15171 :           unsigned lup_mul = (nvects * const_nunits) / group_size;
    9760        15171 :           lupdate_mul
    9761        15171 :             = build_vector_from_val (step_vectype,
    9762        15171 :                                      SCALAR_FLOAT_TYPE_P (stept)
    9763           28 :                                      ? build_real_from_wide (stept, lup_mul,
    9764              :                                                              UNSIGNED)
    9765        30314 :                                      : build_int_cstu (stept, lup_mul));
    9766              :         }
    9767              :       else
    9768              :         {
    9769              :           if (SCALAR_FLOAT_TYPE_P (stept))
    9770              :             {
    9771              :               tree tem = build_int_cst (integer_type_node, vf);
    9772              :               lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
    9773              :             }
    9774              :           else
    9775              :             lupdate_mul = build_int_cst (stept, vf);
    9776              :           lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
    9777              :                                                       lupdate_mul);
    9778              :         }
    9779              :     }
    9780        15395 :   tree peel_mul = NULL_TREE;
    9781        15395 :   if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
    9782              :     {
    9783            0 :       if (SCALAR_FLOAT_TYPE_P (stept))
    9784            0 :         peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
    9785              :                                  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
    9786              :       else
    9787            0 :         peel_mul = gimple_convert (&init_stmts, stept,
    9788              :                                    LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
    9789            0 :       peel_mul = gimple_build_vector_from_val (&init_stmts,
    9790              :                                                step_vectype, peel_mul);
    9791              :     }
    9792        15395 :   tree step_mul = NULL_TREE;
    9793        15395 :   unsigned ivn;
    9794        15395 :   auto_vec<tree> vec_steps;
    9795        31366 :   for (ivn = 0; ivn < nivs; ++ivn)
    9796              :     {
    9797        15971 :       gimple_seq stmts = NULL;
    9798        15971 :       bool invariant = true;
    9799        15971 :       if (nunits.is_constant (&const_nunits))
    9800              :         {
    9801        15971 :           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
    9802        15971 :           tree_vector_builder init_elts (vectype, const_nunits, 1);
    9803        15971 :           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
    9804       102905 :           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
    9805              :             {
    9806              :               /* The scalar steps of the IVs.  */
    9807        86934 :               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
    9808        86934 :               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
    9809        86934 :               step_elts.quick_push (elt);
    9810        86934 :               if (!init_node)
    9811              :                 {
    9812              :                   /* The scalar inits of the IVs if not vectorized.  */
    9813        85672 :                   elt = inits[(ivn*const_nunits + eltn) % group_size];
    9814        85672 :                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
    9815        85672 :                                                   TREE_TYPE (elt)))
    9816          260 :                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
    9817          260 :                                         TREE_TYPE (vectype), elt);
    9818        85672 :                   init_elts.quick_push (elt);
    9819              :                 }
    9820              :               /* The number of steps to add to the initial values.  */
    9821        86934 :               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
    9822       173868 :               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
    9823       173766 :                                    ? build_real_from_wide (stept, mul_elt,
    9824              :                                                            UNSIGNED)
    9825       173766 :                                    : build_int_cstu (stept, mul_elt));
    9826              :             }
    9827        15971 :           vec_step = gimple_build_vector (&init_stmts, &step_elts);
    9828        15971 :           step_mul = gimple_build_vector (&init_stmts, &mul_elts);
    9829        15971 :           if (!init_node)
    9830        15713 :             vec_init = gimple_build_vector (&init_stmts, &init_elts);
    9831        15971 :         }
    9832              :       else
    9833              :         {
    9834              :           tree step = gimple_convert (&init_stmts, stept, steps[0]);
    9835              :           if (init_node)
    9836              :             ;
    9837              :           else if (INTEGRAL_TYPE_P (stept))
    9838              :             {
    9839              :               new_name = gimple_convert (&init_stmts, stept, inits[0]);
    9840              :               /* Build the initial value directly as a VEC_SERIES_EXPR.  */
    9841              :               vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
    9842              :                                        step_vectype, new_name, step);
    9843              :               if (!useless_type_conversion_p (vectype, step_vectype))
    9844              :                 vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
    9845              :                                          vectype, vec_init);
    9846              :             }
    9847              :           else
    9848              :             {
    9849              :               /* Build:
    9850              :                  [base, base, base, ...]
    9851              :                  + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
    9852              :               gcc_assert (SCALAR_FLOAT_TYPE_P (stept));
    9853              :               gcc_assert (flag_associative_math);
    9854              :               gcc_assert (index_vectype != NULL_TREE);
    9855              : 
    9856              :               tree index = build_index_vector (index_vectype, 0, 1);
    9857              :               new_name = gimple_convert (&init_stmts, stept, inits[0]);
    9858              :               tree base_vec = gimple_build_vector_from_val (&init_stmts,
    9859              :                                                             step_vectype,
    9860              :                                                             new_name);
    9861              :               tree step_vec = gimple_build_vector_from_val (&init_stmts,
    9862              :                                                             step_vectype,
    9863              :                                                             step);
    9864              :               vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
    9865              :                                        step_vectype, index);
    9866              :               vec_init = gimple_build (&init_stmts, MULT_EXPR,
    9867              :                                        step_vectype, vec_init, step_vec);
    9868              :               vec_init = gimple_build (&init_stmts, PLUS_EXPR,
    9869              :                                        step_vectype, vec_init, base_vec);
    9870              :               if (!useless_type_conversion_p (vectype, step_vectype))
    9871              :                 vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
    9872              :                                          vectype, vec_init);
    9873              :             }
    9874              :           /* iv_loop is nested in the loop to be vectorized. Generate:
    9875              :              vec_step = [S, S, S, S]  */
    9876              :           t = unshare_expr (step);
    9877              :           gcc_assert (CONSTANT_CLASS_P (t)
    9878              :                       || TREE_CODE (t) == SSA_NAME);
    9879              :           vec_step = gimple_build_vector_from_val (&init_stmts,
    9880              :                                                    step_vectype, t);
    9881              :         }
    9882        15971 :       vec_steps.safe_push (vec_step);
    9883        15971 :       if (peel_mul)
    9884              :         {
    9885            0 :           if (!step_mul)
    9886              :             {
    9887            0 :               gcc_assert (!nunits.is_constant ());
    9888              :               step_mul = gimple_build (&init_stmts,
    9889              :                                        MINUS_EXPR, step_vectype,
    9890              :                                        build_zero_cst (step_vectype), peel_mul);
    9891              :             }
    9892              :           else
    9893            0 :             step_mul = gimple_build (&init_stmts,
    9894              :                                      MINUS_EXPR, step_vectype,
    9895              :                                      step_mul, peel_mul);
    9896              :         }
    9897              : 
    9898              :       /* Create the induction-phi that defines the induction-operand.  */
    9899        15971 :       vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
    9900              :                                         "vec_iv_");
    9901        15971 :       induction_phi = create_phi_node (vec_dest, iv_loop->header);
    9902        15971 :       induc_def = PHI_RESULT (induction_phi);
    9903              : 
    9904              :       /* Create the iv update inside the loop  */
    9905        15971 :       tree up = vec_step;
    9906        15971 :       if (lupdate_mul)
    9907              :         {
    9908        15713 :           if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
    9909              :             {
    9910              :               /* When we're using loop_len produced by SELECT_VL, the
    9911              :                  non-final iterations are not always processing VF
    9912              :                  elements.  So rather than vectorizing the induction as
    9913              : 
    9914              :                    _21 = vect_vec_iv_.6_22 + { VF, ... };
    9915              : 
    9916              :                  We should generate:
    9917              : 
    9918              :                    _35 = .SELECT_VL (ivtmp_33, VF);
    9919              :                    vect_cst__22 = [vec_duplicate_expr] _35;
    9920              :                    _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
    9921            0 :               vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
    9922            0 :               tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
    9923              :                                             vectype, 0, 0, false);
    9924            0 :               if (SCALAR_FLOAT_TYPE_P (stept))
    9925            0 :                 expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
    9926              :               else
    9927            0 :                 expr = gimple_convert (&stmts, stept, len);
    9928            0 :               lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
    9929              :                                                           expr);
    9930            0 :               up = gimple_build (&stmts, MULT_EXPR,
    9931              :                                  step_vectype, vec_step, lupdate_mul);
    9932              :             }
    9933              :           else
    9934        15713 :             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
    9935              :                                vec_step, lupdate_mul);
    9936              :         }
    9937        15971 :       vec_def = gimple_convert (&stmts, step_vectype, induc_def);
    9938        15971 :       vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
    9939        15971 :       vec_def = gimple_convert (&stmts, vectype, vec_def);
    9940        15971 :       insert_iv_increment (&incr_si, insert_after, stmts);
    9941        15971 :       add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
    9942              :                    UNKNOWN_LOCATION);
    9943              : 
    9944        15971 :       if (init_node)
    9945          258 :         vec_init = vect_get_slp_vect_def (init_node, ivn);
    9946        15971 :       if (!nested_in_vect_loop
    9947        15971 :           && step_mul
    9948        15971 :           && !integer_zerop (step_mul))
    9949              :         {
    9950        15266 :           gcc_assert (invariant);
    9951        15266 :           vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
    9952        15266 :           up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
    9953              :                              vec_step, step_mul);
    9954        15266 :           vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
    9955              :                                   vec_def, up);
    9956        15266 :           vec_init = gimple_convert (&init_stmts, vectype, vec_def);
    9957              :         }
    9958              : 
    9959              :       /* Set the arguments of the phi node:  */
    9960        15971 :       add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
    9961              : 
    9962        15971 :       slp_node->push_vec_def (induction_phi);
    9963              :     }
    9964        15395 :   if (!nested_in_vect_loop)
    9965              :     {
    9966              :       /* Fill up to the number of vectors we need for the whole group.  */
    9967        15171 :       if (nunits.is_constant (&const_nunits))
    9968        15171 :         nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
    9969              :       else
    9970              :         nivs = 1;
    9971        15171 :       vec_steps.reserve (nivs-ivn);
    9972        30369 :       for (; ivn < nivs; ++ivn)
    9973              :         {
    9974           27 :           slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
    9975           27 :           vec_steps.quick_push (vec_steps[0]);
    9976              :         }
    9977              :     }
    9978              : 
    9979              :   /* Re-use IVs when we can.  We are generating further vector
    9980              :      stmts by adding VF' * stride to the IVs generated above.  */
    9981        15395 :   if (ivn < nvects)
    9982              :     {
    9983         3390 :       if (nunits.is_constant (&const_nunits))
    9984              :         {
    9985         3390 :           unsigned vfp = (least_common_multiple (group_size, const_nunits)
    9986         3390 :                           / group_size);
    9987         3390 :           lupdate_mul
    9988         3390 :               = build_vector_from_val (step_vectype,
    9989         3390 :                                        SCALAR_FLOAT_TYPE_P (stept)
    9990            8 :                                        ? build_real_from_wide (stept,
    9991            8 :                                                                vfp, UNSIGNED)
    9992         6772 :                                        : build_int_cstu (stept, vfp));
    9993              :         }
    9994              :       else
    9995              :         {
    9996              :           if (SCALAR_FLOAT_TYPE_P (stept))
    9997              :             {
    9998              :               tree tem = build_int_cst (integer_type_node, nunits);
    9999              :               lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
   10000              :             }
   10001              :           else
   10002              :             lupdate_mul = build_int_cst (stept, nunits);
   10003              :           lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
   10004              :                                                       lupdate_mul);
   10005              :         }
   10006        10966 :       for (; ivn < nvects; ++ivn)
   10007              :         {
   10008         7576 :           gimple *iv
   10009         7576 :             = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
   10010         7576 :           tree def = gimple_get_lhs (iv);
   10011         7576 :           if (ivn < 2*nivs)
   10012         3488 :             vec_steps[ivn - nivs]
   10013         3488 :               = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
   10014         3488 :                               vec_steps[ivn - nivs], lupdate_mul);
   10015         7576 :           gimple_seq stmts = NULL;
   10016         7576 :           def = gimple_convert (&stmts, step_vectype, def);
   10017        22728 :           def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
   10018         7576 :                               def, vec_steps[ivn % nivs]);
   10019         7576 :           def = gimple_convert (&stmts, vectype, def);
   10020         7576 :           if (gimple_code (iv) == GIMPLE_PHI)
   10021         3488 :             gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
   10022              :           else
   10023              :             {
   10024         4088 :               gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
   10025         4088 :               gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
   10026              :             }
   10027         7576 :           slp_node->push_vec_def (def);
   10028              :         }
   10029              :     }
   10030              : 
   10031        15395 :   new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
   10032        15395 :   gcc_assert (!new_bb);
   10033              : 
   10034        15395 :   return true;
   10035        15395 : }
   10036              : 
   10037              : /* Function vectorizable_live_operation_1.
   10038              : 
   10039              :    Helper function for vectorizable_live_operation.  */
   10040              : 
   10041              : static tree
   10042         2842 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
   10043              :                                tree vectype, slp_tree slp_node,
   10044              :                                tree bitsize, tree bitstart, tree vec_lhs,
   10045              :                                tree lhs_type, gimple_stmt_iterator *exit_gsi)
   10046              : {
   10047         2842 :   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
   10048              : 
   10049         2842 :   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
   10050         2842 :   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
   10051         5686 :   for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
   10052         2844 :     SET_PHI_ARG_DEF (phi, i, vec_lhs);
   10053              : 
   10054         2842 :   gimple_seq stmts = NULL;
   10055         2842 :   tree new_tree;
   10056              : 
    10057              :   /* If bitstart is 0 then we can use a BIT_FIELD_REF.  */
   10058         2842 :   if (integer_zerop (bitstart))
   10059              :     {
   10060          217 :       tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
   10061              :                                       vec_lhs_phi, bitsize, bitstart);
   10062              : 
   10063              :       /* Convert the extracted vector element to the scalar type.  */
   10064          217 :       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
   10065              :     }
   10066         2625 :   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
   10067              :     {
   10068              :       /* Emit:
   10069              : 
   10070              :          SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>
   10071              : 
   10072              :          where VEC_LHS is the vectorized live-out result, LEN is the length of
   10073              :          the vector, BIAS is the load-store bias.  The bias should not be used
   10074              :          at all since we are not using load/store operations, but LEN will be
   10075              :          REALLEN + BIAS, so subtract it to get to the correct position.  */
   10076            0 :       gcc_assert (SLP_TREE_LANES (slp_node) == 1);
   10077            0 :       gimple_seq tem = NULL;
   10078            0 :       gimple_stmt_iterator gsi = gsi_last (tem);
   10079            0 :       tree len = vect_get_loop_len (loop_vinfo, &gsi,
   10080              :                                     &LOOP_VINFO_LENS (loop_vinfo),
   10081              :                                     1, vectype, 0, 1, false);
   10082            0 :       gimple_seq_add_seq (&stmts, tem);
   10083              : 
   10084              :       /* LAST_INDEX = LEN - 1.  */
   10085            0 :       tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
   10086            0 :                                      len, build_one_cst (TREE_TYPE (len)));
   10087              : 
   10088              :       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>.  */
   10089            0 :       tree scalar_res
   10090            0 :         = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
   10091              :                         vec_lhs_phi, last_index);
   10092              : 
   10093              :       /* Convert the extracted vector element to the scalar type.  */
   10094            0 :       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
   10095              :     }
   10096         2625 :   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
   10097              :     {
   10098              :       /* Emit:
   10099              : 
   10100              :          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
   10101              : 
   10102              :          where VEC_LHS is the vectorized live-out result and MASK is
   10103              :          the loop mask for the final iteration.  */
   10104            0 :       gcc_assert (SLP_TREE_LANES (slp_node) == 1);
   10105            0 :       tree scalar_type = TREE_TYPE (vectype);
   10106            0 :       gimple_seq tem = NULL;
   10107            0 :       gimple_stmt_iterator gsi = gsi_last (tem);
   10108            0 :       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
   10109              :                                       &LOOP_VINFO_MASKS (loop_vinfo),
   10110              :                                       1, vectype, 0);
   10111            0 :       tree scalar_res;
   10112            0 :       gimple_seq_add_seq (&stmts, tem);
   10113              : 
   10114            0 :       scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
   10115              :                                  mask, vec_lhs_phi);
   10116              : 
   10117              :       /* Convert the extracted vector element to the scalar type.  */
   10118            0 :       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
   10119              :     }
   10120              :   else
   10121              :     {
   10122         2625 :       tree bftype = TREE_TYPE (vectype);
   10123         2625 :       if (VECTOR_BOOLEAN_TYPE_P (vectype))
   10124           85 :         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
   10125         2625 :       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
   10126         2625 :       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
   10127              :                                        &stmts, true, NULL_TREE);
   10128              :     }
   10129              : 
   10130         2842 :   *exit_gsi = gsi_after_labels (exit_bb);
   10131         2842 :   if (stmts)
   10132         2842 :     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
   10133              : 
   10134         2842 :   return new_tree;
   10135              : }
   10136              : 
   10137              : /* Function vectorizable_live_operation.
   10138              : 
   10139              :    STMT_INFO computes a value that is used outside the loop.  Check if
   10140              :    it can be supported.  */
   10141              : 
   10142              : bool
   10143       261768 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   10144              :                              slp_tree slp_node, slp_instance slp_node_instance,
   10145              :                              int slp_index, bool vec_stmt_p,
   10146              :                              stmt_vector_for_cost *cost_vec)
   10147              : {
   10148       261768 :   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   10149       261768 :   imm_use_iterator imm_iter;
   10150       261768 :   tree lhs, lhs_type, bitsize;
   10151       261768 :   tree vectype = SLP_TREE_VECTYPE (slp_node);
   10152       261768 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   10153       261768 :   gimple *use_stmt;
   10154       261768 :   use_operand_p use_p;
   10155       261768 :   auto_vec<tree> vec_oprnds;
   10156       261768 :   int vec_entry = 0;
   10157       261768 :   poly_uint64 vec_index = 0;
   10158              : 
   10159       261768 :   gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
   10160              :               || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
   10161              : 
   10162              :   /* If a stmt of a reduction is live, vectorize it via
   10163              :      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
   10164              :      validity so just trigger the transform here.  */
   10165       261768 :   if (vect_is_reduction (slp_node))
   10166              :     {
   10167        87066 :       if (!vec_stmt_p)
   10168              :         {
   10169        63600 :           SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
   10170        63600 :           return true;
   10171              :         }
   10172              :       /* For SLP reductions we vectorize the epilogue for all involved stmts
   10173              :          together.  For SLP reduction chains we only get here once.  */
   10174        23466 :       if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
   10175        23195 :           && slp_index != 0)
   10176              :         return true;
   10177        23018 :       vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
   10178        23018 :       if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
   10179        23018 :           || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
   10180              :         return true;
   10181              : 
   10182        22123 :       if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
   10183        22123 :           || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
   10184        22114 :         vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
   10185              :                                           slp_node_instance,
   10186              :                                           LOOP_VINFO_MAIN_EXIT (loop_vinfo));
   10187              : 
   10188              :       /* If early break we only have to materialize the reduction on the merge
   10189              :          block, but we have to find an alternate exit first.  */
   10190        22123 :       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
   10191              :         {
   10192           28 :           slp_tree phis_node = slp_node_instance->reduc_phis;
   10193           28 :           stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
   10194           89 :           for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
   10195           28 :             if (exit != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
   10196              :               {
   10197           23 :                 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
   10198              :                                                   phis_node, slp_node_instance,
   10199              :                                                   exit);
   10200           23 :                 break;
   10201           28 :               }
   10202           28 :           if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
   10203            9 :             vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
   10204              :                                               phis_node, slp_node_instance,
   10205              :                                               LOOP_VINFO_MAIN_EXIT
   10206              :                                               (loop_vinfo));
   10207              :         }
   10208              : 
   10209        22123 :       return true;
   10210              :     }
   10211              : 
   10212              :   /* If STMT is not relevant and it is a simple assignment and its inputs are
   10213              :      invariant then it can remain in place, unvectorized.  The original last
   10214              :      scalar value that it computes will be used.  */
   10215       174702 :   if (!STMT_VINFO_RELEVANT_P (stmt_info))
   10216              :     {
   10217            0 :       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
   10218            0 :       if (dump_enabled_p ())
   10219            0 :         dump_printf_loc (MSG_NOTE, vect_location,
   10220              :                          "statement is simple and uses invariant.  Leaving in "
   10221              :                          "place.\n");
   10222            0 :       return true;
   10223              :     }
   10224              : 
   10225       174702 :   gcc_assert (slp_index >= 0);
   10226              : 
   10227              :   /* Get the last occurrence of the scalar index from the concatenation of
   10228              :      all the slp vectors. Calculate which slp vector it is and the index
   10229              :      within.  */
   10230       174702 :   int num_scalar = SLP_TREE_LANES (slp_node);
   10231       174702 :   int num_vec = vect_get_num_copies (vinfo, slp_node);
   10232       174702 :   poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
   10233              : 
   10234              :   /* Calculate which vector contains the result, and which lane of
   10235              :      that vector we need.  */
   10236       174702 :   if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
   10237              :     {
   10238              :       if (dump_enabled_p ())
   10239              :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10240              :                          "Cannot determine which vector holds the"
   10241              :                          " final result.\n");
   10242              :       return false;
   10243              :     }
   10244              : 
   10245       174702 :   if (!vec_stmt_p)
   10246              :     {
   10247              :       /* No transformation required.  */
   10248       136424 :       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
   10249              :         {
   10250        27340 :           if (SLP_TREE_LANES (slp_node) != 1)
   10251              :             {
   10252           19 :               if (dump_enabled_p ())
   10253           19 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10254              :                                  "can't operate on partial vectors "
   10255              :                                  "because an SLP statement is live after "
   10256              :                                  "the loop.\n");
   10257           19 :               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
   10258              :             }
   10259        27321 :           else if (num_vec > 1)
   10260              :             {
   10261        15573 :               if (dump_enabled_p ())
   10262           53 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10263              :                                  "can't operate on partial vectors "
   10264              :                                  "because ncopies is greater than 1.\n");
   10265        15573 :               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
   10266              :             }
   10267              :           else
   10268              :             {
   10269        11748 :               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
   10270              :                                                   OPTIMIZE_FOR_SPEED))
   10271            0 :                 vect_record_loop_mask (loop_vinfo,
   10272              :                                        &LOOP_VINFO_MASKS (loop_vinfo),
   10273              :                                        1, vectype, NULL);
   10274        11748 :               else if (can_vec_extract_var_idx_p (
   10275        11748 :                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
   10276            0 :                 vect_record_loop_len (loop_vinfo,
   10277              :                                       &LOOP_VINFO_LENS (loop_vinfo),
   10278              :                                       1, vectype, 1);
   10279              :               else
   10280              :                 {
   10281        11748 :                   if (dump_enabled_p ())
   10282          655 :                     dump_printf_loc (
   10283          655 :                       MSG_MISSED_OPTIMIZATION, vect_location,
   10284              :                       "can't operate on partial vectors "
   10285              :                       "because the target doesn't support extract "
   10286              :                       "last reduction.\n");
   10287        11748 :                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
   10288              :                 }
   10289              :             }
   10290              :         }
   10291              :       /* ???  Enable for loop costing as well.  */
   10292        27340 :       if (!loop_vinfo)
   10293        64904 :         record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
   10294              :                           0, vect_epilogue);
   10295       136424 :       SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
   10296       136424 :       return true;
   10297              :     }
   10298              : 
   10299              :   /* Use the lhs of the original scalar statement.  */
   10300        38278 :   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
   10301        38278 :   if (dump_enabled_p ())
   10302          988 :     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
   10303              :                      "stmt %G", stmt);
   10304              : 
   10305        38278 :   lhs = gimple_get_lhs (stmt);
   10306        38278 :   lhs_type = TREE_TYPE (lhs);
   10307              : 
   10308        38278 :   bitsize = vector_element_bits_tree (vectype);
   10309              : 
   10310              :   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
   10311        38278 :   gcc_assert (!loop_vinfo
   10312              :               || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
   10313              :                    && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
   10314              :                   || SLP_TREE_LANES (slp_node) == 1));
   10315              : 
   10316              :   /* Get the correct slp vectorized stmt.  */
   10317        38278 :   tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
   10318        38278 :   gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
   10319              : 
    10320              :   /* In case of early break vectorization, also get the first vector def.  */
   10321        38278 :   tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
   10322              : 
   10323              :   /* Get entry to use.  */
   10324        38278 :   tree bitstart = bitsize_int (vec_index);
   10325        38278 :   bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
   10326              : 
   10327        38278 :   if (loop_vinfo)
   10328              :     {
   10329              :       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
   10330              :          requirement, insert one phi node for it.  It looks like:
   10331              :            loop;
   10332              :          BB:
   10333              :            # lhs' = PHI <lhs>
   10334              :          ==>
   10335              :            loop;
   10336              :          BB:
   10337              :            # vec_lhs' = PHI <vec_lhs>
   10338              :            new_tree = lane_extract <vec_lhs', ...>;
   10339              :            lhs' = new_tree;  */
   10340              : 
   10341         2905 :       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    10342              :       /* Check if we have a loop where the chosen exit is not the main exit.
    10343              :          In these cases, for an early break, we restart the iteration the vector
    10344              :          code did, so for the live values we want the value at the start of the
    10345              :          iteration rather than at the end.  */
   10346         2905 :       edge main_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
   10347         2905 :       bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
   10348        15064 :       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
   10349         9254 :         if (!is_gimple_debug (use_stmt)
   10350         9254 :             && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
   10351         2842 :           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
   10352              :             {
   10353         2842 :               edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
   10354         2842 :                                            phi_arg_index_from_use (use_p));
   10355         2842 :               gcc_assert (loop_exit_edge_p (loop, e));
   10356         2842 :               bool main_exit_edge = e == main_e;
   10357         2842 :               tree tmp_vec_lhs = vec_lhs;
   10358         2842 :               tree tmp_bitstart = bitstart;
   10359              : 
    10360              :               /* For an early exit, where the exit is not in the BB that
    10361              :                  leads to the latch, we're restarting the iteration in the
    10362              :                  scalar loop, so get the first live value.  */
   10363         2842 :               bool early_break_first_element_p
   10364         2842 :                 = all_exits_as_early_p || !main_exit_edge;
   10365         2842 :               if (early_break_first_element_p)
   10366              :                 {
   10367          199 :                   tmp_vec_lhs = vec_lhs0;
   10368          199 :                   tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
   10369              :                 }
   10370              : 
   10371         2842 :               gimple_stmt_iterator exit_gsi;
   10372         2842 :               tree new_tree
   10373         2842 :                   = vectorizable_live_operation_1 (loop_vinfo,
   10374              :                                                    e->dest, vectype,
   10375              :                                                    slp_node, bitsize,
   10376              :                                                    tmp_bitstart, tmp_vec_lhs,
   10377              :                                                    lhs_type, &exit_gsi);
   10378              : 
   10379         2842 :               auto gsi = gsi_for_stmt (use_stmt);
   10380         2842 :               tree lhs_phi = gimple_phi_result (use_stmt);
   10381         2842 :               remove_phi_node (&gsi, false);
   10382         2842 :               gimple *copy = gimple_build_assign (lhs_phi, new_tree);
   10383         2842 :               gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
   10384         2842 :               break;
   10385         2905 :             }
   10386              : 
    10387              :       /* There are no other out-of-loop uses of lhs by LC-SSA construction.  */
   10388        12222 :       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
   10389         6412 :         gcc_assert (is_gimple_debug (use_stmt)
   10390         2905 :                     || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
   10391              :     }
   10392              :   else
   10393              :     {
   10394              :       /* For basic-block vectorization simply insert the lane-extraction.  */
   10395        35373 :       tree bftype = TREE_TYPE (vectype);
   10396        35373 :       if (VECTOR_BOOLEAN_TYPE_P (vectype))
   10397            2 :         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
   10398        35373 :       tree new_tree = build3 (BIT_FIELD_REF, bftype,
   10399              :                               vec_lhs, bitsize, bitstart);
   10400        35373 :       gimple_seq stmts = NULL;
   10401        35373 :       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
   10402              :                                        &stmts, true, NULL_TREE);
   10403        35373 :       if (TREE_CODE (new_tree) == SSA_NAME
   10404        70746 :           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
   10405            2 :         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
   10406        35373 :       if (is_a <gphi *> (vec_stmt))
   10407              :         {
   10408         2515 :           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
   10409         2515 :           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
   10410              :         }
   10411              :       else
   10412              :         {
   10413        32858 :           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
   10414        32858 :           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
   10415              :         }
   10416              : 
   10417              :       /* Replace use of lhs with newly computed result.  If the use stmt is a
   10418              :          single arg PHI, just replace all uses of PHI result.  It's necessary
   10419              :          because lcssa PHI defining lhs may be before newly inserted stmt.  */
   10420        35373 :       use_operand_p use_p;
   10421        35373 :       stmt_vec_info use_stmt_info;
   10422       208423 :       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
   10423       137677 :         if (!is_gimple_debug (use_stmt)
   10424       137677 :             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
   10425       101208 :                 || !PURE_SLP_STMT (use_stmt_info)))
   10426              :           {
   10427              :             /* ???  This can happen when the live lane ends up being
   10428              :                rooted in a vector construction code-generated by an
   10429              :                external SLP node (and code-generation for that already
   10430              :                happened).
   10431              :                Doing this is what would happen if that vector CTOR
   10432              :                were not code-generated yet so it is not too bad.
   10433              :                ???  In fact we'd likely want to avoid this situation
   10434              :                in the first place.  */
   10435        61590 :             if (TREE_CODE (new_tree) == SSA_NAME
   10436        61590 :                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
   10437        61590 :                 && gimple_code (use_stmt) != GIMPLE_PHI
   10438       116497 :                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
   10439              :                                                 use_stmt))
   10440              :               {
   10441            0 :                 if (dump_enabled_p ())
   10442            0 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10443              :                                    "Using original scalar computation for "
   10444              :                                    "live lane because use precedes vector "
   10445              :                                    "def\n");
   10446            0 :                 continue;
   10447              :               }
   10448       188946 :             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
   10449              :               {
   10450              :                 /* ???  It can also happen that we end up pulling a def into
   10451              :                    a loop where replacing out-of-loop uses would require
   10452              :                    a new LC SSA PHI node.  Retain the original scalar in
   10453              :                    those cases as well.  PR98064.  */
   10454        63678 :                 edge e;
   10455        63678 :                 if (TREE_CODE (new_tree) == SSA_NAME
   10456        63678 :                     && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
   10457        63678 :                     && (gimple_bb (use_stmt)->loop_father
   10458        63678 :                         != gimple_bb (vec_stmt)->loop_father)
   10459              :                     /* But a replacement in a LC PHI is OK.  This happens
   10460              :                        in gcc.dg/vect/bb-slp-57.c for example.  */
   10461         7303 :                     && (gimple_code (use_stmt) != GIMPLE_PHI
   10462         3161 :                         || (((e = phi_arg_edge_from_use (use_p)), true)
   10463         3161 :                             && !loop_exit_edge_p
   10464         3161 :                                   (gimple_bb (vec_stmt)->loop_father, e)))
   10465        69289 :                     && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
   10466         5611 :                                             gimple_bb (use_stmt)->loop_father))
   10467              :                   {
   10468            0 :                     if (dump_enabled_p ())
   10469            0 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10470              :                                        "Using original scalar computation for "
   10471              :                                        "live lane because there is an "
   10472              :                                        "out-of-loop definition for it\n");
   10473            0 :                     continue;
   10474              :                   }
   10475        63678 :                 SET_USE (use_p, new_tree);
   10476              :               }
   10477        61590 :             update_stmt (use_stmt);
   10478        35373 :           }
   10479              :     }
   10480              : 
   10481              :   return true;
   10482       261768 : }
   10483              : 
   10484              : /* Given loop represented by LOOP_VINFO, return true if computation of
   10485              :    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   10486              :    otherwise.  */
   10487              : 
   10488              : static bool
   10489        61760 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
   10490              : {
   10491        61760 :   gcc_assert (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo));
   10492              : 
   10493              :   /* Constant case.  */
   10494        61760 :   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
   10495              :     {
   10496        35996 :       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
   10497        35996 :       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
   10498              : 
   10499        35996 :       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
   10500        35996 :       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
   10501        35996 :       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
   10502              :         return true;
   10503              :     }
   10504              : 
   10505        25764 :   widest_int max;
   10506        25764 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   10507              :   /* Check the upper bound of loop niters.  */
   10508        25764 :   if (get_max_loop_iterations (loop, &max))
   10509              :     {
   10510        25764 :       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
   10511        25764 :       signop sgn = TYPE_SIGN (type);
   10512        25764 :       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
   10513        25764 :       if (max < type_max)
   10514        25587 :         return true;
   10515        25764 :     }
   10516              :   return false;
   10517        25764 : }
   10518              : 
   10519              : /* Return a mask type with half the number of elements as OLD_TYPE,
   10520              :    given that it should have mode NEW_MODE.  */
   10521              : 
   10522              : tree
   10523         4795 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
   10524              : {
   10525         4795 :   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
   10526         4795 :   return build_truth_vector_type_for_mode (nunits, new_mode);
   10527              : }
   10528              : 
   10529              : /* Return a mask type with twice as many elements as OLD_TYPE (the
   10530              :    counterpart of vect_halve_mask_nunits), given that it should have mode NEW_MODE.  */
   10531              : 
   10532              : tree
   10533         7208 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
   10534              : {
   10535         7208 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
   10536         7208 :   return build_truth_vector_type_for_mode (nunits, new_mode);
   10537              : }
   10538              : 
   10539              : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   10540              :    contain a sequence of NVECTORS masks that each control a vector of type
   10541              :    VECTYPE; NVECTORS must be nonzero.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   10542              :    these vector masks with the vector version of SCALAR_MASK, and that use is added to LOOP_VINFO's scalar_cond_masked_set.  */
   10543              : 
   10544              : void
   10545       105109 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
   10546              :                        unsigned int nvectors, tree vectype, tree scalar_mask)
   10547              : {
   10548       105109 :   gcc_assert (nvectors != 0);
   10549              : 
   10550       105109 :   if (scalar_mask)
   10551              :     {
   10552         4979 :       scalar_cond_masked_key cond (scalar_mask, nvectors);
   10553         4979 :       loop_vinfo->scalar_cond_masked_set.add (cond);
   10554              :     }
   10555              : 
   10556       105109 :   masks->mask_set.add (std::make_pair (vectype, nvectors));
   10557       105109 : }
   10558              : 
   10559              : /* Given a complete set of masks MASKS, extract mask number INDEX
   10560              :    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   10561              :    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
   10562              : 
   10563              :    LOOP_VINFO's partial-vector style must be while_ult or avx512.  See the
   10564              :    comment above vec_loop_masks for more details about the mask arrangement.  */
   10565              : 
   10566              : tree
   10567          208 : vect_get_loop_mask (loop_vec_info loop_vinfo,
   10568              :                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
   10569              :                     unsigned int nvectors, tree vectype, unsigned int index)
   10570              : {
   10571          208 :   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
   10572              :       == vect_partial_vectors_while_ult)
   10573              :     {
   10574            0 :       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
   10575            0 :       tree mask_type = rgm->type;
   10576              : 
   10577              :       /* Populate the rgroup's mask array, if this is the first time we've
   10578              :          used it.  */
   10579            0 :       if (rgm->controls.is_empty ())
   10580              :         {
   10581            0 :           rgm->controls.safe_grow_cleared (nvectors, true);
   10582            0 :           for (unsigned int i = 0; i < nvectors; ++i)
   10583              :             {
   10584            0 :               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
   10585              :               /* Provide a dummy definition until the real one is available.  */
   10586            0 :               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
   10587            0 :               rgm->controls[i] = mask;
   10588              :             }
   10589              :         }
   10590              : 
   10591            0 :       tree mask = rgm->controls[index];
   10592            0 :       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
   10593            0 :                     TYPE_VECTOR_SUBPARTS (vectype)))
   10594              :         {
   10595              :           /* A loop mask for data type X can be reused for data type Y
   10596              :              if X has N times more elements than Y and if Y's elements
   10597              :              are N times bigger than X's.  In this case each sequence
   10598              :              of N elements in the loop mask will be all-zero or all-one.
   10599              :              We can then view-convert the mask so that each sequence of
   10600              :              N elements is replaced by a single element.  */
   10601            0 :           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
   10602              :                                   TYPE_VECTOR_SUBPARTS (vectype)));
   10603            0 :           gimple_seq seq = NULL;
   10604            0 :           mask_type = truth_type_for (vectype);
   10605            0 :           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
   10606            0 :           if (seq)
   10607            0 :             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
   10608              :         }
   10609            0 :       return mask;
   10610              :     }
   10611          208 :   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
   10612              :            == vect_partial_vectors_avx512)
   10613              :     {
   10614              :       /* The number of scalars per iteration and the number of vectors are
   10615              :          both compile-time constants.  */
   10616          208 :       unsigned int nscalars_per_iter
   10617          208 :         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
   10618          208 :                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
   10619              : 
   10620          208 :       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
   10621              : 
   10622              :       /* The stored nV is dependent on the mask type produced.  */
   10623          208 :       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
   10624              :                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
   10625              :                   == rgm->factor);
   10626          208 :       nvectors = rgm->factor;
   10627              : 
   10628              :       /* Populate the rgroup's mask array, if this is the first time we've
   10629              :          used it.  */
   10630          208 :       if (rgm->controls.is_empty ())
   10631              :         {
   10632           20 :           rgm->controls.safe_grow_cleared (nvectors, true);
   10633          106 :           for (unsigned int i = 0; i < nvectors; ++i)
   10634              :             {
   10635           86 :               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
   10636              :               /* Provide a dummy definition until the real one is available.  */
   10637           86 :               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
   10638           86 :               rgm->controls[i] = mask;
   10639              :             }
   10640              :         }
   10641          208 :       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
   10642              :                     TYPE_VECTOR_SUBPARTS (vectype)))
   10643          160 :         return rgm->controls[index];
   10644              : 
   10645              :       /* Otherwise the rgroup mask has a multiple of VECTYPE's element
   10646              :          count, so split it.  The masks are asserted to have integer modes,
   10647              :          so we can operate on the integer representation and shift the whole mask.  */
   10648           48 :       unsigned HOST_WIDE_INT factor;
   10649           48 :       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
   10650           48 :                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
   10651            0 :       gcc_assert (ok);
   10652           48 :       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
   10653           48 :       tree mask_type = truth_type_for (vectype);
   10654           48 :       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
   10655           48 :       unsigned vi = index / factor;
   10656           48 :       unsigned vpart = index % factor;
   10657           48 :       tree vec = rgm->controls[vi];
   10658           48 :       gimple_seq seq = NULL;
   10659           48 :       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
   10660           48 :                           lang_hooks.types.type_for_mode
   10661           48 :                                 (TYPE_MODE (rgm->type), 1), vec);
   10662              :       /* For integer mode masks simply shift the right bits into position.  */
   10663           48 :       if (vpart != 0)
   10664           40 :         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
   10665              :                             build_int_cst (integer_type_node,
   10666           80 :                                            (TYPE_VECTOR_SUBPARTS (vectype)
   10667           40 :                                             * vpart)));
   10668           48 :       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
   10669           48 :                                     (TYPE_MODE (mask_type), 1), vec);
   10670           48 :       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
   10671           48 :       if (seq)
   10672           48 :         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
   10673           48 :       return vec;
   10674              :     }
   10675              :   else
   10676            0 :     gcc_unreachable ();
   10677              : }
   10678              : 
   10679              : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   10680              :    lengths for controlling an operation on VECTYPE.  The operation splits
   10681              :    each element of VECTYPE into FACTOR separate subelements, measuring the
   10682              :    length as a number of these subelements.  NVECTORS must be nonzero.  */
   10683              : 
   10684              : void
   10685            0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
   10686              :                       unsigned int nvectors, tree vectype, unsigned int factor)
   10687              : {
   10688            0 :   gcc_assert (nvectors != 0);
   10689            0 :   if (lens->length () < nvectors)
   10690            0 :     lens->safe_grow_cleared (nvectors, true);
   10691            0 :   rgroup_controls *rgl = &(*lens)[nvectors - 1];
   10692              : 
   10693              :   /* The number of scalars per iteration, scalar occupied bytes and
   10694              :      the number of vectors are all compile-time constants.  */
   10695            0 :   unsigned int nscalars_per_iter
   10696            0 :     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
   10697            0 :                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
   10698              : 
   10699            0 :   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
   10700              :     {
   10701              :       /* For now, we only support cases in which all loads and stores fall back
   10702              :          to VnQI or none do.  */
   10703            0 :       gcc_assert (!rgl->max_nscalars_per_iter
   10704              :                   || (rgl->factor == 1 && factor == 1)
   10705              :                   || (rgl->max_nscalars_per_iter * rgl->factor
   10706              :                       == nscalars_per_iter * factor));
   10707            0 :       rgl->max_nscalars_per_iter = nscalars_per_iter;
   10708            0 :       rgl->type = vectype;
   10709            0 :       rgl->factor = factor;
   10710              :     }
   10711            0 : }
   10712              : 
   10713              : /* Given a complete set of lengths LENS, extract length number INDEX
   10714              :    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   10715              :    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
   10716              :    multiplied by the number of elements that should be processed.
   10717              :    Insert any set-up statements before GSI.  If ADJUSTED is true and LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS is nonzero, return the rgroup's bias-adjusted length instead.  */
   10718              : 
   10719              : tree
   10720            0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
   10721              :                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
   10722              :                    unsigned int index, unsigned int factor, bool adjusted)
   10723              : {
   10724            0 :   rgroup_controls *rgl = &(*lens)[nvectors - 1];
   10725            0 :   bool use_bias_adjusted_len =
   10726            0 :     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
   10727              : 
   10728              :   /* Populate the rgroup's len array, if this is the first time we've
   10729              :      used it.  */
   10730            0 :   if (rgl->controls.is_empty ())
   10731              :     {
   10732            0 :       rgl->controls.safe_grow_cleared (nvectors, true);
   10733            0 :       for (unsigned int i = 0; i < nvectors; ++i)
   10734              :         {
   10735            0 :           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
   10736            0 :           gcc_assert (len_type != NULL_TREE);
   10737              : 
   10738            0 :           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
   10739              : 
   10740              :           /* Provide a dummy definition until the real one is available.  */
   10741            0 :           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
   10742            0 :           rgl->controls[i] = len;
   10743              : 
   10744            0 :           if (use_bias_adjusted_len)
   10745              :             {
   10746            0 :               gcc_assert (i == 0);
   10747            0 :               tree adjusted_len =
   10748            0 :                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
   10749            0 :               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
   10750            0 :               rgl->bias_adjusted_ctrl = adjusted_len;
   10751              :             }
   10752              :         }
   10753              :     }
   10754              : 
   10755            0 :   if (use_bias_adjusted_len && adjusted)
   10756            0 :     return rgl->bias_adjusted_ctrl;
   10757              : 
   10758            0 :   tree loop_len = rgl->controls[index];
   10759            0 :   if (rgl->factor == 1 && factor == 1)
   10760              :     {
   10761            0 :       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
   10762            0 :       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
   10763            0 :       if (maybe_ne (nunits1, nunits2))
   10764              :         {
   10765              :           /* A loop len for data type X can be reused for data type Y
   10766              :              if X has N times more elements than Y and if Y's elements
   10767              :              are N times bigger than X's.  */
   10768            0 :           gcc_assert (multiple_p (nunits1, nunits2));
   10769            0 :           factor = exact_div (nunits1, nunits2).to_constant ();
   10770            0 :           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   10771            0 :           gimple_seq seq = NULL;
   10772            0 :           loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
   10773            0 :                                    build_int_cst (iv_type, factor));
   10774            0 :           if (seq)
   10775            0 :             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
   10776              :         }
   10777            0 :     }
   10778            0 :   else if (factor && rgl->factor != factor)
   10779              :     {
   10780              :       /* The number of scalars per iteration, scalar occupied bytes and
   10781              :          the number of vectors are all compile-time constants.  */
   10782            0 :       unsigned int nscalars_per_iter
   10783            0 :         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
   10784            0 :                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
   10785            0 :       unsigned int rglvecsize = rgl->factor * rgl->max_nscalars_per_iter;
   10786            0 :       unsigned int vecsize = nscalars_per_iter * factor;
   10787            0 :       if (rglvecsize > vecsize)
   10788              :         {
   10789            0 :           unsigned int fac = rglvecsize / vecsize;
   10790            0 :           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   10791            0 :           gimple_seq seq = NULL;
   10792            0 :           loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
   10793            0 :                                    build_int_cst (iv_type, fac));
   10794            0 :           if (seq)
   10795            0 :             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
   10796              :         }
   10797            0 :       else if (rglvecsize < vecsize)
   10798              :         {
   10799            0 :           unsigned int fac = vecsize / rglvecsize;
   10800            0 :           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   10801            0 :           gimple_seq seq = NULL;
   10802            0 :           loop_len = gimple_build (&seq, MULT_EXPR, iv_type, loop_len,
   10803            0 :                                    build_int_cst (iv_type, fac));
   10804            0 :           if (seq)
   10805            0 :             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
   10806              :         }
   10807              :     }
   10808              :   return loop_len;
   10809              : }
   10810              : 
   10811              : /* Generate the loop len mask for the compare mask STMT and return it.  LENS,
   10812              :    NVECTORS, VECTYPE, INDEX and FACTOR are passed to vect_get_loop_len (with GSI) to obtain the adjusted len; the call below is inserted before COND_GSI.
   10813              : 
   10814              :    tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
   10815              : */
   10816              : tree
   10817            0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
   10818              :                         gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
   10819              :                         unsigned int nvectors, tree vectype, tree stmt,
   10820              :                         unsigned int index, unsigned int factor)
   10821              : {
   10822            0 :   tree all_one_mask = build_all_ones_cst (vectype);
   10823            0 :   tree all_zero_mask = build_zero_cst (vectype);
   10824            0 :   tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
   10825              :                                 factor, true);
   10826            0 :   tree bias = build_int_cst (intQI_type_node,
   10827            0 :                              LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
   10828            0 :   tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
   10829            0 :   gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
   10830              :                                             all_one_mask, all_zero_mask, len,
   10831              :                                             bias);
   10832            0 :   gimple_call_set_lhs (call, len_mask);
   10833            0 :   gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
   10834              : 
   10835            0 :   return len_mask;
   10836              : }
   10837              : 
   10838              : /* Scale profiling counters by estimation for LOOP which is vectorized
   10839              :    by factor VF.  EXIT_E is the edge (presumably the loop exit) whose probability is adjusted.
   10840              :    If FLAT is true, the loop we started with had an unrealistically flat
   10841              :    profile.  */
   10842              : 
   10843              : static void
   10844        61803 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
   10845              : {
   10846              :   /* For flat profiles do not scale down proportionally by VF and only
   10847              :      cap by known iteration count bounds.  */
   10848        61803 :   if (flat)
   10849              :     {
   10850        34713 :       if (dump_file && (dump_flags & TDF_DETAILS))
   10851         5303 :         fprintf (dump_file,
   10852              :                  "Vectorized loop profile seems flat; not scaling iteration "
   10853              :                  "count down by the vectorization factor %i\n", vf);
   10854        34713 :       scale_loop_profile (loop, profile_probability::always (),
   10855              :                           get_likely_max_loop_iterations_int (loop));
   10856        34713 :       return;
   10857              :     }
   10858              :   /* Loop body executes VF fewer times and exit increases VF times.  */
   10859        27090 :   profile_count entry_count = loop_preheader_edge (loop)->count ();
   10860              : 
   10861              :   /* If we have an unreliable loop profile, avoid dropping the entry
   10862              :      count below the header count.  This can happen when loops
   10863              :      have unrealistically low trip counts.  */
   10864        27090 :   while (vf > 1
   10865        28156 :          && loop->header->count > entry_count
   10866        57342 :          && loop->header->count < entry_count * vf)
   10867              :     {
   10868         2096 :       if (dump_file && (dump_flags & TDF_DETAILS))
   10869          155 :         fprintf (dump_file,
   10870              :                  "Vectorization factor %i seems too large for profile "
   10871              :                  "previously believed to be consistent; reducing.\n", vf);
   10872         2096 :       vf /= 2;
   10873              :     }
   10874              : 
   10875        27090 :   if (entry_count.nonzero_p ())
   10876        27090 :     set_edge_probability_and_rescale_others
   10877        27090 :             (exit_e,
   10878        27090 :              entry_count.probability_in (loop->header->count / vf));
   10879              :   /* Avoid producing very large exit probability when we do not have
   10880              :      sensible profile.  */
   10881            0 :   else if (exit_e->probability < profile_probability::always () / (vf * 2))
   10882            0 :     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
   10883        27090 :   loop->latch->count = single_pred_edge (loop->latch)->count ();
   10884              : 
   10885        27090 :   scale_loop_profile (loop, profile_probability::always () / vf,
   10886              :                       get_likely_max_loop_iterations_int (loop));
   10887              : }
   10888              : 
   10889              : /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
   10890              :    original loop that has now been vectorized.
   10891              : 
   10892              :    The inits of the data_references need to be advanced with the number of
   10893              :    iterations of the main loop.  This has been computed in vect_do_peeling and
   10894              :    is stored in parameter ADVANCE.
   10895              : 
   10896              :    Since the loop_vec_info of this EPILOGUE was constructed for the original
   10897              :    loop, its stmt_vec_infos all point to the original statements.  These need
   10898              :    to be updated to point to their corresponding copies.
   10899              : 
   10900              :    The data_reference's connections also need to be updated.  Their
   10901              :    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
   10902              :    stmt_vec_infos, their statements need to point to their corresponding
   10903              :    copy.  NOTE(review): the local MAPPING is declared but not referenced in this function.  */
   10904              : 
   10905              : static void
   10906         6847 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
   10907              : {
   10908         6847 :   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
   10909         6847 :   hash_map<tree,tree> mapping;
   10910         6847 :   gimple *orig_stmt, *new_stmt;
   10911         6847 :   gimple_stmt_iterator epilogue_gsi;
   10912         6847 :   gphi_iterator epilogue_phi_gsi;
   10913         6847 :   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
   10914         6847 :   basic_block *epilogue_bbs = get_loop_body (epilogue);
   10915         6847 :   unsigned i;
   10916              : 
   10917         6847 :   free (LOOP_VINFO_BBS (epilogue_vinfo));
   10918         6847 :   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
   10919         6847 :   LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
   10920              : 
   10921              :   /* The EPILOGUE loop is a copy of the original loop so they share the same
   10922              :      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
   10923              :      point to the copied statements.  The loop index below shadows the function-level I.  */
   10924        20541 :   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
   10925              :     {
   10926        13694 :       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
   10927        35294 :            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
   10928              :         {
   10929        21600 :           new_stmt = epilogue_phi_gsi.phi ();
   10930              : 
   10931        21600 :           gcc_assert (gimple_uid (new_stmt) > 0);
   10932        21600 :           stmt_vinfo
   10933        21600 :             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
   10934              : 
   10935        21600 :           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
   10936              :         }
   10937              : 
   10938        27388 :       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
   10939       137238 :            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
   10940              :         {
   10941       123544 :           new_stmt = gsi_stmt (epilogue_gsi);
   10942       123544 :           if (is_gimple_debug (new_stmt))
   10943        20500 :             continue;
   10944              : 
   10945       103044 :           gcc_assert (gimple_uid (new_stmt) > 0);
   10946       103044 :           stmt_vinfo
   10947       103044 :             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
   10948              : 
   10949       103044 :           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
   10950              : 
   10951       103044 :           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
   10952       103044 :           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
   10953              :             {
   10954         1939 :               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
   10955              :               /* Set BB such that the assert in
   10956              :                 'get_initial_defs_for_reduction' is able to determine that
   10957              :                 the BB of the related stmt is inside this loop.  */
   10958         1939 :               gimple_set_bb (stmt,
   10959              :                              gimple_bb (new_stmt));
   10960         1939 :               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
   10961         1939 :               gcc_assert (related_vinfo == NULL
   10962              :                           || related_vinfo == stmt_vinfo);
   10963              :             }
   10964              :         }
   10965              :     }
   10966              : 
   10967         6847 :   struct data_reference *dr;
   10968         6847 :   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
   10969        30928 :   FOR_EACH_VEC_ELT (datarefs, i, dr)
   10970              :     {
   10971        24081 :       orig_stmt = DR_STMT (dr);
   10972        24081 :       gcc_assert (gimple_uid (orig_stmt) > 0);
   10973        24081 :       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
   10974        24081 :       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
   10975              :     }
   10976              : 
   10977              :   /* Advance data_reference's with the number of iterations of the previous
   10978              :      loop and its prologue.  */
   10979         6847 :   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
   10980              : 
   10981              :   /* Remember the advancement made.  */
   10982         6847 :   LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
   10983         6847 : }
   10984              : 
   10985              : /*  When vectorizing early break statements, instructions that happen before
   10986              :     the early break in the current BB need to be moved to after the early
   10987              :     break.  This function deals with that and assumes that any validity
   10988              :     checks have already been performed.
   10989              : 
   10990              :     While moving the instructions, if it encounters a VUSE or VDEF it
   10991              :     corrects the VUSES as it moves the statements along.  The statements are
   10992              :     moved into LOOP_VINFO_EARLY_BRK_DEST_BB (LOOP_VINFO), after its labels.  */
   10993              : 
   10994              : static void
   10995         1411 : move_early_exit_stmts (loop_vec_info loop_vinfo)
   10996              : {
   10997         1411 :   DUMP_VECT_SCOPE ("move_early_exit_stmts");
   10998              : 
   10999         1411 :   if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
   11000         1192 :     return;
   11001              : 
   11002              :   /* Move all stmts that need moving.  */
   11003          219 :   basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
   11004          219 :   gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
   11005              : 
   11006          219 :   tree last_seen_vuse = NULL_TREE;
   11007          537 :   for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
   11008              :     {
   11009              :       /* We have to update crossed degenerate virtual PHIs.  Simply
   11010              :          elide them.  */
   11011          318 :       if (gphi *vphi = dyn_cast <gphi *> (stmt))
   11012              :         {
   11013            7 :           tree vdef = gimple_phi_result (vphi);
   11014            7 :           tree vuse = gimple_phi_arg_def (vphi, 0);
   11015            7 :           imm_use_iterator iter;
   11016            7 :           use_operand_p use_p;
   11017            7 :           gimple *use_stmt;
   11018           30 :           FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
   11019              :             {
   11020           48 :               FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
   11021           16 :                 SET_USE (use_p, vuse);
   11022            7 :             }
   11023            7 :           auto gsi = gsi_for_stmt (stmt);
   11024            7 :           remove_phi_node (&gsi, true);
   11025            7 :           last_seen_vuse = vuse;
   11026            7 :           continue;
   11027            7 :         }
   11028              : 
   11029              :       /* Check to see if statement is still required for vect or has been
   11030              :          elided.  */
   11031          311 :       auto stmt_info = loop_vinfo->lookup_stmt (stmt);
   11032          311 :       if (!stmt_info)
   11033            0 :         continue;
   11034              : 
   11035          311 :       if (dump_enabled_p ())
   11036          160 :         dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
   11037              : 
   11038          311 :       gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
   11039          311 :       gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
   11040          622 :       last_seen_vuse = gimple_vuse (stmt);
   11041              :     }
   11042              : 
   11043              :   /* Update all the stmts with their new reaching VUSES.  */
   11044          689 :   for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
   11045              :     {
   11046          198 :       if (dump_enabled_p ())
   11047          162 :           dump_printf_loc (MSG_NOTE, vect_location,
   11048              :                            "updating vuse to %T for load %G",
   11049              :                            last_seen_vuse, p);
   11050          198 :       gimple_set_vuse (p, last_seen_vuse);
   11051          198 :       update_stmt (p);
   11052              :     }
   11053              : 
   11054              :   /* And update the LC PHIs on exits.  */
   11055         1108 :   for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP  (loop_vinfo)))
   11056          451 :     if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
   11057          245 :       if (gphi *phi = get_virtual_phi (e->dest))
   11058          464 :         SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
   11059              : }
   11060              : 
   11061              : /* Generate the IV code that fills in LOOP_VINFO_EARLY_BRK_NITERS_VAR: replace
   11062              :    its placeholder definition with a loop-header PHI stepped on every iteration.  */
   11063              : 
   11064              : static void
   11065         1411 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
   11066              : {
   11067         1411 :   DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
   11068              : 
   11069         1411 :   if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
   11070              :       /* If no peeling was done then we have no IV to update.  */
   11071         1411 :       || !LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo))
   11072          584 :     return;
   11073              : 
   11074          827 :   tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
   11075          827 :   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
   11076          827 :   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   11077          827 :   tree ty_var = TREE_TYPE (phi_var);
   11078          827 :   auto loop = LOOP_VINFO_LOOP (loop_vinfo);
   11079          827 :   tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
   11080              : 
   11081              :   /* Remove the placeholder stmt that defines PHI_VAR; the SSA name is reused.  */
   11082          827 :   gimple *def = SSA_NAME_DEF_STMT (phi_var);
   11083          827 :   auto def_gsi = gsi_for_stmt (def);
   11084          827 :   gsi_remove (&def_gsi, true);
   11085              : 
   11086          827 :   auto induction_phi = create_phi_node (induc_var, loop->header);
   11087          827 :   tree induc_def = PHI_RESULT (induction_phi);
   11088              : 
   11089              :   /* INIT_STMTS go to the preheader, STMTS to the header, IV_STMTS to the exit.  */
   11090          827 :   gimple_seq init_stmts = NULL;
   11091          827 :   gimple_seq stmts = NULL;
   11092          827 :   gimple_seq iv_stmts = NULL;
   11093          827 :   tree tree_vf = build_int_cst (ty_var, vf);
   11094              : 
   11095              :   /* When .SELECT_VL is in use, step by the loop length obtained below instead
   11096              :      of the constant VF, as the amount can change between loop iterations.  */
   11097          827 :   if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
   11098              :     {
   11099            0 :       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   11100            0 :       tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
   11101              :                                    NULL_TREE, 0, 0, true);
   11102              :     }
   11103              :   /* Step the IV.  NOTE(review): the pointer path converts to sizetype twice; confirm.  */
   11104          827 :   tree iter_var;
   11105          827 :   if (POINTER_TYPE_P (ty_var))
   11106              :     {
   11107            0 :       tree offset = gimple_convert (&stmts, sizetype, tree_vf);
   11108            0 :       iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
   11109              :                                gimple_convert (&stmts, sizetype, offset));
   11110              :     }
   11111              :   else
   11112              :     {
   11113          827 :       tree offset = gimple_convert (&stmts, ty_var, tree_vf);
   11114          827 :       iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
   11115              :     }
   11116              :   /* Preheader value of the IV: zero, minus NITERS_SKIP when that is set.  */
   11117          827 :   tree init_var = build_zero_cst (ty_var);
   11118          827 :   if (niters_skip)
   11119            0 :     init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
   11120              :                              gimple_convert (&init_stmts, ty_var, niters_skip));
   11121              : 
   11122          827 :   add_phi_arg (induction_phi, iter_var,
   11123              :                loop_latch_edge (loop), UNKNOWN_LOCATION);
   11124          827 :   add_phi_arg (induction_phi, init_var,
   11125              :                loop_preheader_edge (loop), UNKNOWN_LOCATION);
   11126              : 
   11127              :   /* The preheader edge; its source block receives INIT_STMTS below.  */
   11128          827 :   auto pe = loop_preheader_edge (loop);
   11129              : 
   11130              :   /* With skipped iterations, clamp the IV at zero (MAX_EXPR in the signed type)
   11131              :      and assign the result to PHI_VAR in the early exit merge block.  */
   11132          827 :   if (niters_skip)
   11133              :     {
   11134            0 :       tree induc_type = TREE_TYPE (induc_def);
   11135            0 :       tree s_induc_type = signed_type_for (induc_type);
   11136            0 :       induc_def = gimple_build (&iv_stmts, MAX_EXPR, s_induc_type,
   11137              :                                 gimple_convert (&iv_stmts, s_induc_type,
   11138              :                                                 induc_def),
   11139              :                                 build_zero_cst (s_induc_type));
   11140            0 :       auto stmt = gimple_build_assign (phi_var,
   11141              :                                        gimple_convert (&iv_stmts, induc_type,
   11142              :                                                        induc_def));
   11143            0 :       gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
   11144            0 :       basic_block exit_bb = NULL;
   11145              :       /* The early exit merge block is the destination of the first non-main exit.  */
   11146            0 :       for (auto e : get_loop_exit_edges (loop))
   11147            0 :         if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
   11148              :           {
   11149            0 :             exit_bb = e->dest;
   11150            0 :             break;
   11151            0 :           }
   11152              : 
   11153            0 :       gcc_assert (exit_bb);
   11154            0 :       auto exit_gsi = gsi_after_labels (exit_bb);
   11155            0 :       gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
   11156              :     }
   11157              :   /* Append INIT_STMTS after the last non-debug stmt of the preheader block.  */
   11158          827 :   auto psi = gsi_last_nondebug_bb (pe->src);
   11159          827 :   gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
   11160              :   /* Insert the IV update in the loop header, after any labels.  */
   11161          827 :   basic_block bb = loop->header;
   11162          827 :   auto si = gsi_after_labels (bb);
   11163          827 :   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
   11164              : }
   11165              : 
   11166              : /* Function vect_transform_loop.
   11167              : 
   11168              :    The analysis phase has determined that the loop is vectorizable.
   11169              :    Vectorize the loop - create vectorized stmts to replace the scalar
   11170              :    stmts in the loop, and update the loop exit condition.
   11171              :    Returns scalar epilogue loop if any.  */
   11172              : 
   11173              : class loop *
   11174        61803 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   11175              : {
   11176        61803 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   11177        61803 :   class loop *epilogue = NULL;
   11178        61803 :   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   11179        61803 :   int nbbs = loop->num_nodes;
   11180        61803 :   int i;
   11181        61803 :   tree niters_vector = NULL_TREE;
   11182        61803 :   tree step_vector = NULL_TREE;
   11183        61803 :   tree niters_vector_mult_vf = NULL_TREE;
   11184        61803 :   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   11185        61803 :   unsigned int lowest_vf = constant_lower_bound (vf);
   11186        61803 :   gimple *stmt;
   11187        61803 :   bool check_profitability = false;
   11188        61803 :   unsigned int th;
   11189        61803 :   bool flat = maybe_flat_loop_profile (loop);
   11190        61803 :   bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
   11191              : 
   11192        61803 :   DUMP_VECT_SCOPE ("vec_transform_loop");
   11193              : 
   11194        61803 :   if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
   11195        54956 :     loop_vinfo->shared->check_datarefs ();
   11196              : 
   11197              :   /* Use the more conservative vectorization threshold.  If the number
   11198              :      of iterations is constant assume the cost check has been performed
   11199              :      by our caller.  If the threshold makes all loops profitable that
   11200              :      run at least the (estimated) vectorization factor number of times
   11201              :      checking is pointless, too.  */
   11202        61803 :   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
   11203        61803 :   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
   11204              :     {
   11205        18734 :       if (dump_enabled_p ())
   11206          176 :         dump_printf_loc (MSG_NOTE, vect_location,
   11207              :                          "Profitability threshold is %d loop iterations.\n",
   11208              :                          th);
   11209              :       check_profitability = true;
   11210              :     }
   11211              : 
   11212              :   /* Make sure there exists a single-predecessor exit bb.  Do this before
   11213              :      versioning.   */
   11214        61803 :   edge e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
   11215        61803 :   if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
   11216              :     {
   11217        19098 :       split_loop_exit_edge (e, true);
   11218        19098 :       if (dump_enabled_p ())
   11219         2287 :         dump_printf (MSG_NOTE, "split exit edge\n");
   11220              :     }
   11221              : 
   11222              :   /* Version the loop first, if required, so the profitability check
   11223              :      comes first.  */
   11224              : 
   11225        61803 :   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
   11226              :     {
   11227         3749 :       class loop *sloop
   11228         3749 :         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
   11229         3749 :       sloop->force_vectorize = false;
   11230         3749 :       check_profitability = false;
   11231              :     }
   11232              : 
   11233              :   /* Make sure there exists a single-predecessor exit bb also on the
   11234              :      scalar loop copy.  Do this after versioning but before peeling
   11235              :      so CFG structure is fine for both scalar and if-converted loop
   11236              :      to make slpeel_duplicate_current_defs_from_edges face matched
   11237              :      loop closed PHI nodes on the exit.  */
   11238        61803 :   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
   11239              :     {
   11240         8067 :       e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
   11241         8067 :       if (! single_pred_p (e->dest))
   11242              :         {
   11243         7807 :           split_loop_exit_edge (e, true);
   11244         7807 :           if (dump_enabled_p ())
   11245         1148 :             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
   11246              :         }
   11247              :     }
   11248              : 
   11249        61803 :   tree niters = vect_build_loop_niters (loop_vinfo);
   11250        61803 :   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
   11251        61803 :   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
   11252        61803 :   tree advance;
   11253        61803 :   drs_init_vec orig_drs_init;
   11254        61803 :   bool niters_no_overflow = uncounted_p ? false /* Not known.  */
   11255        61760 :                                         : loop_niters_no_overflow (loop_vinfo);
   11256              : 
   11257        61803 :   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
   11258              :                               &step_vector, &niters_vector_mult_vf, th,
   11259              :                               check_profitability, niters_no_overflow,
   11260              :                               &advance);
   11261              : 
   11262              :   /* Assign hierarchical discriminators to the vectorized loop.  */
   11263        61803 :   poly_uint64 vf_val = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   11264        61803 :   unsigned int vf_int = constant_lower_bound (vf_val);
   11265        61803 :   if (vf_int > DISCR_MULTIPLICITY_MAX)
   11266              :     vf_int = DISCR_MULTIPLICITY_MAX;
   11267              : 
   11268              :   /* Assign unique copy_id dynamically instead of using hardcoded constants.
   11269              :      Epilogue and main vectorized loops get different copy_ids.  */
   11270        61803 :   gimple *loop_last = last_nondebug_stmt (loop->header);
   11271        61803 :   location_t loop_loc
   11272        61803 :     = loop_last ? gimple_location (loop_last) : UNKNOWN_LOCATION;
   11273        61525 :   if (loop_loc != UNKNOWN_LOCATION)
   11274              :     {
   11275        50884 :       unsigned int copyid = allocate_copyid_base (loop_loc, 1);
   11276        50884 :       assign_discriminators_to_loop (loop, vf_int, copyid);
   11277              :     }
   11278        61803 :   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
   11279        61803 :       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
   11280              :     {
   11281              :       /* Ifcvt duplicates loop preheader, loop body and produces a basic
   11282              :          block after loop exit.  We need to scale all that.  */
   11283           88 :       basic_block preheader
   11284           88 :         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
   11285           88 :       preheader->count
   11286              :         = preheader->count.apply_probability
   11287           88 :               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
   11288           88 :       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
   11289              :                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
   11290           88 :       LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo)->dest->count = preheader->count;
   11291              :     }
   11292              : 
   11293        61803 :   if (niters_vector == NULL_TREE && !uncounted_p)
   11294              :     {
   11295        28141 :       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
   11296        28141 :           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
   11297        57069 :           && known_eq (lowest_vf, vf))
   11298              :         {
   11299        28138 :           niters_vector
   11300        28138 :             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
   11301        28138 :                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
   11302        28138 :           step_vector = build_one_cst (TREE_TYPE (niters));
   11303              :         }
   11304          793 :       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
   11305            1 :         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
   11306              :                                      &step_vector, niters_no_overflow);
   11307              :       else
   11308              :         /* vect_do_peeling subtracted the number of peeled prologue
   11309              :            iterations from LOOP_VINFO_NITERS.  */
   11310          792 :         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
   11311              :                                      &niters_vector, &step_vector,
   11312              :                                      niters_no_overflow);
   11313              :     }
   11314              : 
   11315              :   /* 1) Make sure the loop header has exactly two entries
   11316              :      2) Make sure we have a preheader basic block.  */
   11317              : 
   11318        61803 :   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
   11319              : 
   11320        61803 :   split_edge (loop_preheader_edge (loop));
   11321              : 
   11322        61803 :   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
   11323              :     /* This will deal with any possible peeling.  */
   11324            1 :     vect_prepare_for_masked_peels (loop_vinfo);
   11325              : 
   11326              :   /* Handle any code motion that we need to for early-break vectorization after
   11327              :      we've done peeling but just before we start vectorizing.  */
   11328        61803 :   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
   11329              :     {
   11330         1411 :       vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
   11331         1411 :       move_early_exit_stmts (loop_vinfo);
   11332              :     }
   11333              : 
   11334              :   /* Remove existing clobber stmts and prefetches.  */
   11335       188728 :   for (i = 0; i < nbbs; i++)
   11336              :     {
   11337       126925 :       basic_block bb = bbs[i];
   11338      1095032 :       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
   11339              :         {
   11340       841182 :           stmt = gsi_stmt (si);
   11341       841182 :           if (gimple_clobber_p (stmt)
   11342       841182 :               || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
   11343              :             {
   11344           88 :               unlink_stmt_vdef (stmt);
   11345           88 :               gsi_remove (&si, true);
   11346           88 :               release_defs (stmt);
   11347              :             }
   11348              :           else
   11349       841094 :             gsi_next (&si);
   11350              :         }
   11351              :     }
   11352              : 
   11353              :   /* Schedule the SLP instances.  */
   11354        61803 :   if (!loop_vinfo->slp_instances.is_empty ())
   11355              :     {
   11356        61803 :       DUMP_VECT_SCOPE ("scheduling SLP instances");
   11357        61803 :       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
   11358              :     }
   11359              : 
   11360              :   /* Generate the loop invariant statements.  */
   11361        61803 :   if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
   11362              :     {
   11363           73 :       if (dump_enabled_p ())
   11364           30 :         dump_printf_loc (MSG_NOTE, vect_location,
   11365              :                          "------>generating loop invariant statements\n");
   11366           73 :       gimple_stmt_iterator gsi;
   11367           73 :       gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
   11368           73 :       gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
   11369              :                              GSI_CONTINUE_LINKING);
   11370              :     }
   11371              : 
   11372              :   /* Stub out scalar statements that must not survive vectorization and
   11373              :      were not picked as relevant in any SLP instance.
   11374              :      Doing this here helps with grouped statements, or statements that
   11375              :      are involved in patterns.  */
   11376       188728 :   for (i = 0; i < nbbs; i++)
   11377              :     {
   11378       126925 :       basic_block bb = bbs[i];
   11379       126925 :       stmt_vec_info stmt_info;
   11380       253850 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
   11381      1681355 :            !gsi_end_p (gsi); gsi_next (&gsi))
   11382              :         {
   11383      1554430 :           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
   11384         6348 :           if (!call || !gimple_call_internal_p (call))
   11385      1549241 :             continue;
   11386         5189 :           internal_fn ifn = gimple_call_internal_fn (call);
   11387         5189 :           if (ifn == IFN_MASK_LOAD)
   11388              :             {
   11389          737 :               tree lhs = gimple_get_lhs (call);
   11390          737 :               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
   11391              :                 {
   11392            0 :                   tree zero = build_zero_cst (TREE_TYPE (lhs));
   11393            0 :                   gimple *new_stmt = gimple_build_assign (lhs, zero);
   11394            0 :                   gsi_replace (&gsi, new_stmt, true);
   11395              :                 }
   11396              :             }
   11397         4452 :           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
   11398              :             {
   11399         2297 :               tree lhs = gimple_get_lhs (call);
   11400         2297 :               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
   11401              :                 {
   11402            0 :                   tree else_arg
   11403            0 :                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
   11404            0 :                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
   11405            0 :                   gsi_replace (&gsi, new_stmt, true);
   11406              :                 }
   11407              :             }
   11408         2155 :           else if (ifn == IFN_MASK_CALL
   11409            4 :                    && (stmt_info = loop_vinfo->lookup_stmt (call))
   11410            4 :                    && !STMT_VINFO_RELEVANT_P (stmt_info)
   11411         2159 :                    && !STMT_VINFO_LIVE_P (stmt_info))
   11412              :             {
   11413            4 :               gcc_assert (!gimple_call_lhs (stmt_info->stmt));
   11414            4 :               loop_vinfo->remove_stmt (stmt_info);
   11415              :             }
   11416              :         }
   11417              :     }
   11418              : 
   11419        61803 :   if (!uncounted_p)
   11420              :     {
   11421              :       /* The vectorization factor is always > 1, so if we use an IV increment of
   11422              :          1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
   11423        61760 :       if (integer_onep (step_vector))
   11424        61742 :         niters_no_overflow = true;
   11425              : 
   11426        61760 :       vect_set_loop_condition (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
   11427              :                                loop_vinfo, niters_vector, step_vector,
   11428        61760 :                                niters_vector_mult_vf, !niters_no_overflow);
   11429              :     }
   11430              : 
   11431        61803 :   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
   11432              : 
   11433              :   /* True if the final iteration might not handle a full vector's
   11434              :      worth of scalar iterations.  */
   11435       123606 :   bool final_iter_may_be_partial
   11436        61803 :     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
   11437        61803 :       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
   11438              : 
   11439              :   /* +1 to convert latch counts to loop iteration counts.  */
   11440        61803 :   int bias_for_lowest = 1;
   11441              : 
   11442              :   /* When we are peeling for gaps then we take away one scalar iteration
   11443              :      from the vector loop.  Thus we can adjust the upper bound by one
   11444              :      scalar iteration.  But only when we know the bound applies to the
   11445              :      IV exit test which might not be true when we have multiple exits.  */
   11446        61803 :   if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
   11447       120407 :     bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
   11448              : 
   11449        61803 :   int bias_for_assumed = bias_for_lowest;
   11450        61803 :   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   11451        61803 :   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
   11452              :     {
   11453              :       /* When the amount of peeling is known at compile time, the first
   11454              :          iteration will have exactly alignment_npeels active elements.
   11455              :          In the worst case it will have at least one.  */
   11456            1 :       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
   11457            1 :       bias_for_lowest += lowest_vf - min_first_active;
   11458            1 :       bias_for_assumed += assumed_vf - min_first_active;
   11459              :     }
   11460              :   /* In these calculations the "- 1" converts loop iteration counts
   11461              :      back to latch counts.  */
   11462        61803 :   if (loop->any_upper_bound)
   11463              :     {
   11464        61787 :       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
   11465        61787 :       loop->nb_iterations_upper_bound
   11466        61787 :         = (final_iter_may_be_partial
   11467        63200 :            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
   11468         2826 :                             lowest_vf) - 1
   11469        60374 :            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
   11470       120748 :                              lowest_vf) - 1);
   11471        61787 :       if (main_vinfo
   11472              :           /* Both peeling for alignment and peeling for gaps can end up
   11473              :              with the scalar epilogue running for more than VF-1 iterations.  */
   11474         6847 :           && !main_vinfo->peeling_for_alignment
   11475         6799 :           && !main_vinfo->peeling_for_gaps)
   11476              :         {
   11477         6616 :           unsigned int bound;
   11478         6616 :           poly_uint64 main_iters
   11479         6616 :             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
   11480              :                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
   11481         6616 :           main_iters
   11482         6616 :             = upper_bound (main_iters,
   11483         6616 :                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
   11484        13232 :           if (can_div_away_from_zero_p (main_iters,
   11485         6616 :                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
   11486              :                                         &bound))
   11487         6616 :             loop->nb_iterations_upper_bound
   11488         6616 :               = wi::umin ((bound_wide_int) (bound - 1),
   11489         6616 :                           loop->nb_iterations_upper_bound);
   11490              :       }
   11491              :   }
   11492        61803 :   if (loop->any_likely_upper_bound)
   11493        61787 :     loop->nb_iterations_likely_upper_bound
   11494        61787 :       = (final_iter_may_be_partial
   11495        63200 :          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
   11496         1413 :                           + bias_for_lowest, lowest_vf) - 1
   11497        60374 :          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
   11498        61787 :                            + bias_for_lowest, lowest_vf) - 1);
   11499        61803 :   if (loop->any_estimate)
   11500        35614 :     loop->nb_iterations_estimate
   11501        35614 :       = (final_iter_may_be_partial
   11502        36307 :          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
   11503         1386 :                           assumed_vf) - 1
   11504        34921 :          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
   11505        70535 :                            assumed_vf) - 1);
   11506        61803 :   scale_profile_for_vect_loop (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
   11507              :                                assumed_vf, flat);
   11508              : 
   11509        61803 :   if (dump_enabled_p ())
   11510              :     {
   11511        11015 :       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
   11512              :         {
   11513         9559 :           dump_printf_loc (MSG_NOTE, vect_location,
   11514              :                            "LOOP VECTORIZED\n");
   11515         9559 :           if (loop->inner)
   11516          345 :             dump_printf_loc (MSG_NOTE, vect_location,
   11517              :                              "OUTER LOOP VECTORIZED\n");
   11518         9559 :           dump_printf (MSG_NOTE, "\n");
   11519              :         }
   11520              :       else
   11521         1456 :         dump_printf_loc (MSG_NOTE, vect_location,
   11522              :                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
   11523         1456 :                          GET_MODE_NAME (loop_vinfo->vector_mode));
   11524              :     }
   11525              : 
   11526              :   /* Loops vectorized with a variable factor won't benefit from
   11527              :      unrolling/peeling.  */
   11528        61803 :   if (!vf.is_constant ())
   11529              :     {
   11530              :       loop->unroll = 1;
   11531              :       if (dump_enabled_p ())
   11532              :         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
   11533              :                          " variable-length vectorization factor\n");
   11534              :     }
   11535              : 
   11536              :   /* When we have unrolled the loop due to a user requested value we should
   11537              :      leave it up to the RTL unroll heuristics to determine if it's still worth
   11538              :      while to unroll more.  */
   11539        61803 :   if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
   11540           44 :     loop->unroll = 0;
   11541              : 
   11542              :   /* Free SLP instances here because otherwise stmt reference counting
   11543              :      won't work.  */
   11544              :   slp_instance instance;
   11545       151888 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
   11546        90085 :     vect_free_slp_instance (instance);
   11547        61803 :   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   11548              :   /* Clear-up safelen field since its value is invalid after vectorization
   11549              :      since vectorized loop can have loop-carried dependencies.  */
   11550        61803 :   loop->safelen = 0;
   11551              : 
   11552        61803 :   if (epilogue)
   11553              :     {
   11554              :       /* Accumulate past advancements made.  */
   11555         6847 :       if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
   11556           75 :         advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
   11557              :                                LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
   11558              :                                advance);
   11559         6847 :       update_epilogue_loop_vinfo (epilogue, advance);
   11560              : 
   11561         6847 :       epilogue->simduid = loop->simduid;
   11562         6847 :       epilogue->force_vectorize = loop->force_vectorize;
   11563         6847 :       epilogue->dont_vectorize = false;
   11564              :     }
   11565              : 
   11566        61803 :   return epilogue;
   11567        61803 : }
   11568              : 
   11569              : /* Revert if-conversion for the masked stores in LOOP: guard each
   11570              :    IFN_MASK_STORE with a test of its mask so that, when the mask is all
   11571              :    zeros, the store and (where possible) the producers of the stored
   11572              :    value are skipped.  For example,
   11573              :      for (i=0; i<n; i++)
   11574              :        if (c[i])
   11575              :         {
   11576              :           p1[i] += 1;
   11577              :           p2[i] = p3[i] +2;
   11578              :         }
   11579              :    this transformation will produce the following semi-hammock:
   11580              : 
   11581              :    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
   11582              :      {
   11583              :        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
   11584              :        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
   11585              :        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
   11586              :        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
   11587              :        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
   11588              :        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
   11589              :      }
   11590              : */
   11591              : 
   11592              : void
   11593          493 : optimize_mask_stores (class loop *loop)
   11594              : {
   11595          493 :   basic_block *bbs = get_loop_body (loop);
   11596          493 :   unsigned nbbs = loop->num_nodes;
   11597          493 :   unsigned i;
   11598          493 :   basic_block bb;
   11599          493 :   class loop *bb_loop;
   11600          493 :   gimple_stmt_iterator gsi;
   11601          493 :   gimple *stmt;
   11602          493 :   auto_vec<gimple *> worklist;
   11603          493 :   auto_purge_vect_location sentinel;
   11604              : 
   11605          493 :   vect_location = find_loop_location (loop);
   11606              :   /* Collect every IFN_MASK_STORE call in the loop body into WORKLIST.  */
   11607         1972 :   for (i = 0; i < nbbs; i++)
   11608              :     {
   11609          986 :       bb = bbs[i];
   11610        17311 :       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
   11611        15339 :            gsi_next (&gsi))
   11612              :         {
   11613        15339 :           stmt = gsi_stmt (gsi);
   11614        15339 :           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
   11615          695 :             worklist.safe_push (stmt);
   11616              :         }
   11617              :     }
   11618              : 
   11619          493 :   free (bbs);
   11620          493 :   if (worklist.is_empty ())
   11621           68 :     return;
   11622              : 
   11623              :   /* Pop stores from the back; each iteration builds one STORE_BB.  */
   11624         1103 :   while (!worklist.is_empty ())
   11625              :     {
   11626          678 :       gimple *last, *last_store;
   11627          678 :       edge e, efalse;
   11628          678 :       tree mask;
   11629          678 :       basic_block store_bb, join_bb;
   11630          678 :       gimple_stmt_iterator gsi_to;
   11631          678 :       tree vdef, new_vdef;
   11632          678 :       gphi *phi;
   11633          678 :       tree vectype;
   11634          678 :       tree zero;
   11635              : 
   11636          678 :       last = worklist.pop ();
   11637          678 :       mask = gimple_call_arg (last, 2);
   11638          678 :       bb = gimple_bb (last);
   11639              :       /* Split BB after LAST and create STORE_BB as the guarded block.
   11640              :          STORE_BB is added to the loop containing BB, which can differ
   11641              :          from LOOP when a two-level loop nest is vectorized and the
   11642              :          masked store belongs to the inner loop.  */
   11643          678 :       e = split_block (bb, last);
   11644          678 :       bb_loop = bb->loop_father;
   11645          678 :       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
   11646          678 :       join_bb = e->dest;
   11647          678 :       store_bb = create_empty_bb (bb);
   11648          678 :       add_bb_to_loop (store_bb, bb_loop);
   11649          678 :       e->flags = EDGE_TRUE_VALUE;
   11650          678 :       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
   11651              :       /* Treat entering STORE_BB as likely; E gets the inverse.  */
   11652          678 :       efalse->probability = profile_probability::likely ();
   11653          678 :       e->probability = efalse->probability.invert ();
   11654          678 :       store_bb->count = efalse->count ();
   11655          678 :       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
   11656          678 :       if (dom_info_available_p (CDI_DOMINATORS))
   11657          678 :         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
   11658          678 :       if (dump_enabled_p ())
   11659          351 :         dump_printf_loc (MSG_NOTE, vect_location,
   11660              :                          "Create new block %d to sink mask stores.",  /* NOTE(review): no trailing newline in this message; confirm.  */
   11661              :                          store_bb->index);
   11662              :       /* End BB with MASK == 0; E (true) bypasses STORE_BB, EFALSE enters it.  */
   11663          678 :       vectype = TREE_TYPE (mask);
   11664          678 :       zero = build_zero_cst (vectype);
   11665          678 :       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
   11666          678 :       gsi = gsi_last_bb (bb);
   11667          678 :       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
   11668              :       /* Create new PHI node for vdef of the last masked store:
   11669              :          .MEM_2 = VDEF <.MEM_1>
   11670              :          will be converted to
   11671              :          .MEM_3 = VDEF <.MEM_1>
   11672              :          and new PHI node will be created in join bb
   11673              :          .MEM_2 = PHI <.MEM_1, .MEM_3>
   11674              :       */
   11675          678 :       vdef = gimple_vdef (last);
   11676          678 :       new_vdef = make_ssa_name (gimple_vop (cfun), last);
   11677          678 :       gimple_set_vdef (last, new_vdef);
   11678          678 :       phi = create_phi_node (vdef, join_bb);
   11679          678 :       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
   11680              : 
   11681              :       /* Put all masked stores with the same mask to STORE_BB if possible.  */
   11682          712 :       while (true)
   11683              :         {
   11684          695 :           gimple_stmt_iterator gsi_from;
   11685          695 :           gimple *stmt1 = NULL;
   11686              : 
   11687              :           /* Move masked store to STORE_BB.  */
   11688          695 :           last_store = last;
   11689          695 :           gsi = gsi_for_stmt (last);
   11690          695 :           gsi_from = gsi;
   11691              :           /* Shift GSI to the previous stmt for further traversal.  */
   11692          695 :           gsi_prev (&gsi);
   11693          695 :           gsi_to = gsi_start_bb (store_bb);
   11694          695 :           gsi_move_before (&gsi_from, &gsi_to);
   11695              :           /* Re-point GSI_TO at the start of the now non-empty STORE_BB.  */
   11696          695 :           gsi_to = gsi_start_bb (store_bb);
   11697          695 :           if (dump_enabled_p ())
   11698          367 :             dump_printf_loc (MSG_NOTE, vect_location,
   11699              :                              "Move stmt to created bb\n%G", last);
   11700              :           /* Walk backwards from the store, moving producers until one cannot move.  */
   11701         4960 :           while (!gsi_end_p (gsi))
   11702              :             {
   11703         4959 :               tree lhs;
   11704         4959 :               imm_use_iterator imm_iter;
   11705         4959 :               use_operand_p use_p;
   11706         4959 :               bool res;
   11707              : 
   11708              :               /* Skip debug statements.  */
   11709         4959 :               if (is_gimple_debug (gsi_stmt (gsi)))
   11710              :                 {
   11711            3 :                   gsi_prev (&gsi);
   11712         3225 :                   continue;
   11713              :                 }
   11714         4956 :               stmt1 = gsi_stmt (gsi);
   11715              :               /* Do not consider statements writing to memory or having
   11716              :                  volatile operand.  */
   11717         9762 :               if (gimple_vdef (stmt1)
   11718         9762 :                   || gimple_has_volatile_ops (stmt1))
   11719              :                 break;
   11720         4806 :               gsi_from = gsi;
   11721         4806 :               gsi_prev (&gsi);
   11722         4806 :               lhs = gimple_get_lhs (stmt1);
   11723         4806 :               if (!lhs)
   11724              :                 break;
   11725              : 
   11726              :               /* LHS of vectorized stmt must be SSA_NAME.  */
   11727         4806 :               if (TREE_CODE (lhs) != SSA_NAME)
   11728              :                 break;
   11729              : 
   11730         4806 :               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
   11731              :                 {
   11732              :                   /* Remove dead scalar statement.  */
   11733         3554 :                   if (has_zero_uses (lhs))
   11734              :                     {
   11735         3222 :                       gsi_remove (&gsi_from, true);
   11736         3222 :                       release_defs (stmt1);
   11737         3222 :                       continue;
   11738              :                     }
   11739              :                 }
   11740              : 
   11741              :               /* Check that LHS does not have uses outside of STORE_BB.  */
   11742         1584 :               res = true;
   11743         4309 :               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
   11744              :                 {
   11745         1685 :                   gimple *use_stmt;
   11746         1685 :                   use_stmt = USE_STMT (use_p);
   11747         1685 :                   if (is_gimple_debug (use_stmt))
   11748            0 :                     continue;
   11749         1685 :                   if (gimple_bb (use_stmt) != store_bb)
   11750              :                     {
   11751              :                       res = false;
   11752              :                       break;
   11753              :                     }
   11754         1584 :                 }
   11755         1584 :               if (!res)
   11756              :                 break;
   11757              : 
   11758         1040 :               if (gimple_vuse (stmt1)  /* STMT1 reads memory: it must use the VUSE of the store.  */
   11759         1476 :                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
   11760              :                 break;
   11761              : 
   11762              :               /* Can move STMT1 to STORE_BB.  */
   11763         1040 :               if (dump_enabled_p ())
   11764          563 :                 dump_printf_loc (MSG_NOTE, vect_location,
   11765              :                                  "Move stmt to created bb\n%G", stmt1);
   11766         1040 :               gsi_move_before (&gsi_from, &gsi_to);
   11767              :               /* Shift GSI_TO for further insertion.  */
   11768         2080 :               gsi_prev (&gsi_to);
   11769              :             }
   11770              :           /* Continue only if the next queued store uses MASK and is STMT1.  */
   11771          695 :           if (worklist.is_empty ()
   11772          270 :               || gimple_call_arg (worklist.last (), 2) != mask
   11773           17 :               || worklist.last () != stmt1)
   11774              :             break;
   11775           17 :           last = worklist.pop ();
   11776           17 :         }
   11777         1356 :       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
   11778              :     }
   11779          493 : }
   11780              : 
   11781              : /* Decide whether it is possible to use a zero-based induction variable
   11782              :    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   11783              :    the value that the induction variable must be able to hold in order
   11784              :    to ensure that the rgroups eventually have no active vector elements.
   11785              :    Return -1 otherwise (i.e. when max_loop_iterations fails).  */
   11786              : 
   11787              : widest_int
   11788        46786 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
   11789              : {
   11790        46786 :   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
   11791        46786 :   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   11792        46786 :   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
   11793              : 
   11794              :   /* Calculate the value that the induction variable must be able
   11795              :      to hit in order to ensure that we end the loop with an all-false mask.
   11796              :      This involves adding the maximum number of inactive trailing scalar
   11797              :      iterations.  */
   11798        46786 :   widest_int iv_limit = -1;
   11799        46786 :   if (max_loop_iterations (loop, &iv_limit))
   11800              :     {
   11801        46786 :       if (niters_skip)
   11802              :         {
   11803              :           /* Add the maximum number of skipped iterations to the
   11804              :              maximum iteration count.  */
   11805            0 :           if (TREE_CODE (niters_skip) == INTEGER_CST)
   11806            0 :             iv_limit += wi::to_widest (niters_skip);
   11807              :           else
   11808            0 :             iv_limit += max_vf - 1;
   11809              :         }
   11810        46786 :       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
   11811              :         /* Be conservative: add MAX_VF - 1, as for a non-constant skip.  */
   11812          320 :         iv_limit += max_vf - 1;
   11813              : 
   11814              :       /* IV_LIMIT is the maximum number of latch iterations, which is also
   11815              :          the maximum in-range IV value.  Round this value down to the previous
   11816              :          vector alignment boundary and then add an extra full iteration.  */
   11817        46786 :       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   11818        46786 :       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
   11819              :     }
   11820        46786 :   return iv_limit;
   11821              : }
   11822              : 
   11823              : /* For the given rgroup_controls RGC, check whether an induction variable
   11824              :    would ever hit a value that produces a set of all-false masks or zero
   11825              :    lengths before wrapping around.  Return true if it's possible to wrap
   11826              :    around before hitting the desirable value, otherwise return false.  */
   11827              : 
   11828              : bool
   11829            0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
   11830              : {
   11831            0 :   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
   11832              : 
   11833            0 :   if (iv_limit == -1)
   11834              :     return true;  /* No limit could be computed, so assume wrapping is possible.  */
   11835              : 
   11836            0 :   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
   11837            0 :   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   11838            0 :   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
   11839              : 
   11840            0 :   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
   11841              :     return true;  /* IV_LIMIT * NITEMS needs more bits than COMPARE_TYPE provides.  */
   11842              : 
   11843              :   return false;
   11844            0 : }
        

Generated by: LCOV version 2.4-beta

The LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.