LCOV - code coverage report
Current view: top level - gcc/config/i386 - i386-expand.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 87.0 % 15023 13066
Test Date: 2026-04-20 14:57:17 Functions: 93.7 % 270 253
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
       2              : 
       3              : This file is part of GCC.
       4              : 
       5              : GCC is free software; you can redistribute it and/or modify
       6              : it under the terms of the GNU General Public License as published by
       7              : the Free Software Foundation; either version 3, or (at your option)
       8              : any later version.
       9              : 
      10              : GCC is distributed in the hope that it will be useful,
      11              : but WITHOUT ANY WARRANTY; without even the implied warranty of
      12              : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13              : GNU General Public License for more details.
      14              : 
      15              : You should have received a copy of the GNU General Public License
      16              : along with GCC; see the file COPYING3.  If not see
      17              : <http://www.gnu.org/licenses/>.  */
      18              : 
      19              : #define IN_TARGET_CODE 1
      20              : 
      21              : #include "config.h"
      22              : #include "system.h"
      23              : #include "coretypes.h"
      24              : #include "backend.h"
      25              : #include "rtl.h"
      26              : #include "tree.h"
      27              : #include "memmodel.h"
      28              : #include "gimple.h"
      29              : #include "cfghooks.h"
      30              : #include "cfgloop.h"
      31              : #include "df.h"
      32              : #include "tm_p.h"
      33              : #include "stringpool.h"
      34              : #include "expmed.h"
      35              : #include "optabs.h"
      36              : #include "regs.h"
      37              : #include "emit-rtl.h"
      38              : #include "recog.h"
      39              : #include "cgraph.h"
      40              : #include "diagnostic.h"
      41              : #include "cfgbuild.h"
      42              : #include "alias.h"
      43              : #include "fold-const.h"
      44              : #include "attribs.h"
      45              : #include "calls.h"
      46              : #include "stor-layout.h"
      47              : #include "varasm.h"
      48              : #include "output.h"
      49              : #include "insn-attr.h"
      50              : #include "flags.h"
      51              : #include "except.h"
      52              : #include "explow.h"
      53              : #include "expr.h"
      54              : #include "cfgrtl.h"
      55              : #include "common/common-target.h"
      56              : #include "langhooks.h"
      57              : #include "reload.h"
      58              : #include "gimplify.h"
      59              : #include "dwarf2.h"
      60              : #include "tm-constrs.h"
      61              : #include "cselib.h"
      62              : #include "sched-int.h"
      63              : #include "opts.h"
      64              : #include "tree-pass.h"
      65              : #include "context.h"
      66              : #include "pass_manager.h"
      67              : #include "target-globals.h"
      68              : #include "gimple-iterator.h"
      69              : #include "shrink-wrap.h"
      70              : #include "builtins.h"
      71              : #include "rtl-iter.h"
      72              : #include "tree-iterator.h"
      73              : #include "dbgcnt.h"
      74              : #include "case-cfn-macros.h"
      75              : #include "dojump.h"
      76              : #include "fold-const-call.h"
      77              : #include "tree-vrp.h"
      78              : #include "tree-ssanames.h"
      79              : #include "selftest.h"
      80              : #include "selftest-rtl.h"
      81              : #include "print-rtl.h"
      82              : #include "intl.h"
      83              : #include "ifcvt.h"
      84              : #include "symbol-summary.h"
      85              : #include "sreal.h"
      86              : #include "ipa-cp.h"
      87              : #include "ipa-prop.h"
      88              : #include "ipa-fnsummary.h"
      89              : #include "wide-int-bitmask.h"
      90              : #include "tree-vector-builder.h"
      91              : #include "debug.h"
      92              : #include "dwarf2out.h"
      93              : #include "i386-options.h"
      94              : #include "i386-builtins.h"
      95              : #include "i386-expand.h"
      96              : #include "asan.h"
      97              : 
      98              : /* Split one or more double-mode RTL references into pairs of half-mode
      99              :    references.  The RTL can be REG, offsettable MEM, integer constant, or
     100              :    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
     101              :    split and "num" is its length.  lo_half and hi_half are output arrays
     102              :    that parallel "operands".  */
     103              : 
      104              : void
      105      4160724 : split_double_mode (machine_mode mode, rtx operands[],
      106              :                    int num, rtx lo_half[], rtx hi_half[])
      107              : {
      108      4160724 :   machine_mode half_mode;
      109      4160724 :   unsigned int byte;
      110      4160724 :   rtx mem_op = NULL_RTX;
      111      4160724 :   int mem_num = 0;
      112              : 
      113      4160724 :   switch (mode)
      114              :     {
      115              :     case E_TImode:
      116              :       half_mode = DImode;
      117              :       break;
      118       607141 :     case E_DImode:
      119       607141 :       half_mode = SImode;
      120       607141 :       break;
      121            6 :     case E_P2HImode:
      122            6 :       half_mode = HImode;
      123            6 :       break;
      124           30 :     case E_P2QImode:
      125           30 :       half_mode = QImode;
      126           30 :       break;
      127            0 :     default:
      128            0 :       gcc_unreachable ();
      129              :     }
      130              : 
      131      4160724 :   byte = GET_MODE_SIZE (half_mode);
      132              : 
      133      8532615 :   while (num--)
      134              :     {
      135      4371891 :       rtx op = operands[num];
      136              : 
      137              :       /* simplify_subreg refuses to split volatile memory addresses,
      138              :          but we still have to handle them.  */
      139      4371891 :       if (MEM_P (op))
      140              :         {
                        :           /* Reuse the previously created halves when the same MEM
                        :              appears again, so equal operands get equal halves.  */
      141      1740924 :           if (mem_op && rtx_equal_p (op, mem_op))
      142              :             {
      143         2432 :               lo_half[num] = lo_half[mem_num];
      144         2432 :               hi_half[num] = hi_half[mem_num];
      145              :             }
      146              :           else
      147              :             {
      148      1738492 :               mem_op = op;
      149      1738492 :               mem_num = num;
      150      1738492 :               lo_half[num] = adjust_address (op, half_mode, 0);
      151      1738492 :               hi_half[num] = adjust_address (op, half_mode, byte);
      152              :             }
      153              :         }
      154              :       else
      155              :         {
      156      2630967 :           lo_half[num] = simplify_gen_subreg (half_mode, op,
      157      2630967 :                                               GET_MODE (op) == VOIDmode
      158              :                                               ? mode : GET_MODE (op), 0);
      159              : 
      160      2630967 :           rtx tmp = simplify_gen_subreg (half_mode, op,
      161      2630967 :                                          GET_MODE (op) == VOIDmode
      162      2630967 :                                          ? mode : GET_MODE (op), byte);
      163              :           /* simplify_gen_subreg will return NULL RTX for the
      164              :              high half of the paradoxical subreg. */
      165      2630967 :           hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
      166              :         }
      167              :     }
      168      4160724 : }
     169              : 
     170              : /* Emit the double word assignment DST = { LO, HI }.  */
     171              : 
      172              : void
      173       100025 : split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
      174              : {
      175       100025 :   rtx dlo, dhi;
      176       100025 :   int deleted_move_count = 0;
      177       100025 :   split_double_mode (mode, &dst, 1, &dlo, &dhi);
      178              :   /* Constraints ensure that if both lo and hi are MEMs, then
      179              :      dst has early-clobber and thus addresses of MEMs don't use
      180              :      dlo/dhi registers.  Otherwise if at least one of lo and hi is a MEM,
      181              :      dlo/dhi are registers.  */
      182       100025 :   if (MEM_P (lo)
      183         5537 :       && rtx_equal_p (dlo, hi)
      184       101003 :       && reg_overlap_mentioned_p (dhi, lo))
      185              :     {
      186              :       /* If dlo is same as hi and lo's address uses dhi register,
      187              :          code below would first emit_move_insn (dhi, hi)
      188              :          and then emit_move_insn (dlo, lo).  But the former
      189              :          would invalidate lo's address.  Load into dhi first,
      190              :          then swap.  */
      191          193 :       emit_move_insn (dhi, lo);
      192          193 :       lo = dhi;
      193              :     }
      194        99832 :   else if (MEM_P (hi)
      195         9461 :            && !MEM_P (lo)
      196         6645 :            && !rtx_equal_p (dlo, lo)
      197       101152 :            && reg_overlap_mentioned_p (dlo, hi))
      198              :     {
      199              :       /* In this case, code below would first emit_move_insn (dlo, lo)
      200              :          and then emit_move_insn (dhi, hi).  But the former would
      201              :          invalidate hi's address.  */
      202           15 :       if (rtx_equal_p (dhi, lo))
      203              :         {
      204              :           /* We can't load into dhi first, so load into dlo
      205              :              first and we'll swap.  */
      206            9 :           emit_move_insn (dlo, hi);
      207            9 :           hi = dlo;
      208              :         }
      209              :       else
      210              :         {
      211              :           /* Load into dhi first.  */
      212            6 :           emit_move_insn (dhi, hi);
      213            6 :           hi = dhi;
      214              :         }
      215              :     }
      216       100025 :   if (!rtx_equal_p (dlo, hi))
      217              :     {
      218        86187 :       if (!rtx_equal_p (dlo, lo))
      219        38214 :         emit_move_insn (dlo, lo);
      220              :       else
      221              :         deleted_move_count++;
      222        86187 :       if (!rtx_equal_p (dhi, hi))
      223        80125 :         emit_move_insn (dhi, hi);
      224              :       else
      225         6062 :         deleted_move_count++;
      226              :     }
      227        13838 :   else if (!rtx_equal_p (lo, dhi))
      228              :     {
      229         6863 :       if (!rtx_equal_p (dhi, hi))
      230         6863 :         emit_move_insn (dhi, hi);
      231              :       else
      232              :         deleted_move_count++;
      233         6863 :       if (!rtx_equal_p (dlo, lo))
      234         6763 :         emit_move_insn (dlo, lo);
      235              :       else
      236          100 :         deleted_move_count++;
      237              :     }
      238         6975 :   else if (mode == TImode)
      239         6957 :     emit_insn (gen_swapdi (dlo, dhi));
      240              :   else
      241           18 :     emit_insn (gen_swapsi (dlo, dhi));
      242              : 
                        :   /* Both component moves were no-ops, so nothing was emitted above.
                        :      NOTE(review): presumably the deleted note keeps the expansion
                        :      non-empty for the caller -- confirm.  */
      243       100025 :   if (deleted_move_count == 2)
      244         3116 :     emit_note (NOTE_INSN_DELETED);
      245       100025 : }
     246              : 
     247              : 
     248              : /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
     249              :    for the target.  */
     250              : 
      251              : void
      252       112807 : ix86_expand_clear (rtx dest)
      253              : {
      254       112807 :   rtx tmp;
      255              : 
      256              :   /* We play register width games, which are only valid after reload.  */
      257       112807 :   gcc_assert (reload_completed);
      258              : 
      259              :   /* Avoid HImode and its attendant prefix byte.  */
      260       225614 :   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
      261          990 :     dest = gen_rtx_REG (SImode, REGNO (dest));
      262       112807 :   tmp = gen_rtx_SET (dest, const0_rtx);
      263              : 
                        :   /* The "xor reg, reg" form clobbers the flags, so represent that
                        :      with an explicit FLAGS_REG clobber in a PARALLEL.  */
      264       112807 :   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
      265              :     {
      266       112807 :       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      267       112807 :       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
      268              :     }
      269              : 
      270       112807 :   emit_insn (tmp);
      271       112807 : }
     272              : 
     273              : /* Return true if V can be broadcasted from an integer of WIDTH bits
     274              :    which is returned in VAL_BROADCAST.  Otherwise, return false.  */
     275              : 
      276              : static bool
      277         4851 : ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
      278              :                 HOST_WIDE_INT &val_broadcast)
      279              : {
      280         4851 :   wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
      281         4851 :   val_broadcast = wi::extract_uhwi (val, 0, width);
                        :   /* Compare every remaining WIDTH-bit chunk of V against the low
                        :      chunk; any mismatch means V is not a broadcast.  */
      282         6543 :   for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
      283              :     {
      284         5089 :       HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      285         5089 :       if (val_broadcast != each)
      286              :         return false;
      287              :     }
                        :   /* Return the broadcast element sign-extended from WIDTH bits.  */
      288         1454 :   val_broadcast = sext_hwi (val_broadcast, width);
      289         1454 :   return true;
      290         4851 : }
     291              : 
     292              : /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */
     293              : 
      294              : rtx
      295        32844 : ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
      296              : {
      297              :   /* Don't use integer vector broadcast if we can't move from GPR to SSE
      298              :      register directly.  */
      299        32844 :   if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
      300              :     return nullptr;
      301              : 
      302        32844 :   unsigned int msize = GET_MODE_SIZE (mode);
      303              : 
      304              :   /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm.  */
      305        32844 :   if (msize != 16 && msize != 32 && msize != 64)
      306              :     return nullptr;
      307              : 
      308              :   /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
      309              :      broadcast only if vector broadcast is available.  */
      310        32844 :   if (!TARGET_AVX
      311         1610 :       || !CONST_WIDE_INT_P (op)
      312         1603 :       || standard_sse_constant_p (op, mode)
      313        34447 :       || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
      314         1603 :           != GET_MODE_BITSIZE (mode)))
      315        31249 :     return nullptr;
      316              : 
      317         1595 :   HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
      318         1595 :   HOST_WIDE_INT val_broadcast;
      319         1595 :   scalar_int_mode broadcast_mode;
      320              :   /* vpbroadcastb zmm requires TARGET_AVX512BW.  */
      321          712 :   if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
      322         2089 :       && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
      323              :                          val_broadcast))
      324              :     broadcast_mode = QImode;
      325          654 :   else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
      326         1968 :            && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
      327              :                               val_broadcast))
      328              :     broadcast_mode = HImode;
      329              :   /* vbroadcasts[sd] only support memory operand w/o AVX2.
      330              :      When msize == 16, pshufs is used for vec_duplicate.
      331              :      When msize == 64, vpbroadcastd is used, and TARGET_AVX512F must be set.  */
      332          412 :   else if ((msize != 32 || TARGET_AVX2)
      333         1768 :            && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
      334              :                            val_broadcast))
      335              :     broadcast_mode = SImode;
      336         1391 :   else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
      337         2641 :            && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
      338              :                               val_broadcast))
      339              :     broadcast_mode = DImode;
      340              :   else
      341          141 :     return nullptr;
      342              : 
      343              :   /* Check if OP can be broadcasted from VAL.  */
      344         1776 :   for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
      345         1561 :     if (val != CONST_WIDE_INT_ELT (op, i))
      346              :       return nullptr;
      347              : 
      348          215 :   unsigned int nunits = (GET_MODE_SIZE (mode)
      349          215 :                          / GET_MODE_SIZE (broadcast_mode));
      350          215 :   machine_mode vector_mode;
      351          215 :   if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
      352            0 :     gcc_unreachable ();
      353          215 :   rtx target = gen_reg_rtx (vector_mode);
      354          215 :   bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
      355              :                                                target,
      356              :                                                GEN_INT (val_broadcast));
      357          215 :   if (!ok)
      358              :     return nullptr;
      359          215 :   target = lowpart_subreg (mode, target, vector_mode);
      360          215 :   return target;
      361              : }
     362              : 
      363              : void
      364     73026595 : ix86_expand_move (machine_mode mode, rtx operands[])
      365              : {
      366     73026595 :   rtx op0, op1;
      367     73026595 :   rtx tmp, addend = NULL_RTX;
      368     73026595 :   enum tls_model model;
      369              : 
      370     73026595 :   op0 = operands[0];
      371     73026595 :   op1 = operands[1];
      372              : 
      373              :   /* Avoid complex sets of likely spilled hard registers before reload.  */
      374     73026595 :   if (!ix86_hardreg_mov_ok (op0, op1))
      375              :     {
      376       138440 :       tmp = gen_reg_rtx (mode);
      377       138440 :       operands[0] = tmp;
      378       138440 :       ix86_expand_move (mode, operands);
      379       138440 :       operands[0] = op0;
      380       138440 :       operands[1] = tmp;
      381       138440 :       op1 = tmp;
      382              :     }
      383              : 
      384     73026595 :   switch (GET_CODE (op1))
      385              :     {
      386       347974 :     case CONST:
      387       347974 :       tmp = XEXP (op1, 0);
      388              : 
      389       347974 :       if (GET_CODE (tmp) != PLUS
      390       336286 :           || !SYMBOL_REF_P (XEXP (tmp, 0)))
      391              :         break;
      392              : 
      393       333623 :       op1 = XEXP (tmp, 0);
      394       333623 :       addend = XEXP (tmp, 1);
      395              :       /* FALLTHRU */
      396              : 
      397      4916627 :     case SYMBOL_REF:
      398      4916627 :       model = SYMBOL_REF_TLS_MODEL (op1);
      399              : 
      400      4916627 :       if (model)
      401        10114 :         op1 = legitimize_tls_address (op1, model, true);
      402      4906513 :       else if (ix86_force_load_from_GOT_p (op1))
      403              :         {
      404              :           /* Load the external function address via GOT slot to avoid PLT.  */
      405           24 :           op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
      406              :                                 (TARGET_64BIT
      407              :                                  ? UNSPEC_GOTPCREL
      408              :                                  : UNSPEC_GOT));
      409           24 :           op1 = gen_rtx_CONST (Pmode, op1);
      410           24 :           op1 = gen_const_mem (Pmode, op1);
      411           20 :           set_mem_alias_set (op1, GOT_ALIAS_SET);
      412              :         }
      413              :       else
      414              :         {
      415              : #if TARGET_PECOFF
      416              :           tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
      417              : 
      418              :           if (tmp)
      419              :             {
      420              :               op1 = tmp;
      421              :               if (!addend)
      422              :                 break;
      423              :             }
      424              :           else
      425              : #endif
      426      4906493 :             {
      427      4906493 :               op1 = operands[1];
      428      4906493 :               break;
      429              :             }
      430              :         }
      431              : 
      432        10134 :       if (addend)
      433              :         {
      434         2786 :           op1 = force_operand (op1, NULL_RTX);
      435         2795 :           op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
      436              :                                      op0, 1, OPTAB_DIRECT);
      437              :         }
      438              :       else
      439         7348 :         op1 = force_operand (op1, op0);
      440              : 
      441        10134 :       if (op1 == op0)
      442              :         return;
      443              : 
      444         1147 :       op1 = convert_to_mode (mode, op1, 1);
      445              : 
                        :       /* FALLTHRU */
      446              :     default:
      447              :       break;
      448              : 
      449      1484838 :     case SUBREG:
      450              :       /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
      451      1484838 :       if (TARGET_64BIT
      452      1257174 :           && mode == TImode
      453              :           && SUBREG_P (op1)
      454        74281 :           && GET_MODE (SUBREG_REG (op1)) == DImode
      455      1530614 :           && SUBREG_BYTE (op1) == 0)
      456        45776 :         op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
      457              :       /* As not all values in XFmode are representable in real_value,
      458              :          we might be called with unfoldable SUBREGs of constants.  */
      459      1484838 :       if (mode == XFmode
      460         3130 :           && CONSTANT_P (SUBREG_REG (op1))
      461            0 :           && can_create_pseudo_p ())
      462              :         {
      463            0 :           machine_mode imode = GET_MODE (SUBREG_REG (op1));
      464            0 :           rtx r = force_const_mem (imode, SUBREG_REG (op1));
      465            0 :           if (r)
      466            0 :             r = validize_mem (r);
      467              :           else
      468            0 :             r = force_reg (imode, SUBREG_REG (op1));
      469            0 :           op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
      470              :         }
      471              :       break;
      472              :     }
      473              : 
      474     73017608 :   if ((flag_pic || MACHOPIC_INDIRECT)
      475     73017608 :       && symbolic_operand (op1, mode))
      476              :     {
      477              : #if TARGET_MACHO
      478              :       if (TARGET_MACHO && !TARGET_64BIT)
      479              :         {
      480              :           /* dynamic-no-pic */
      481              :           if (MACHOPIC_INDIRECT)
      482              :             {
      483              :               tmp = (op0 && REG_P (op0) && mode == Pmode)
      484              :                     ? op0 : gen_reg_rtx (Pmode);
      485              :               op1 = machopic_indirect_data_reference (op1, tmp);
      486              :               if (MACHOPIC_PURE)
      487              :                 op1 = machopic_legitimize_pic_address (op1, mode,
      488              :                                                        tmp == op1 ? 0 : tmp);
      489              :             }
      490              :           if (op0 != op1 && !MEM_P (op0))
      491              :             {
      492              :               rtx insn = gen_rtx_SET (op0, op1);
      493              :               emit_insn (insn);
      494              :               return;
      495              :             }
      496              :         }
      497              : #endif
      498              : 
      499       335809 :       if (MEM_P (op0))
      500        87509 :         op1 = force_reg (mode, op1);
      501       248300 :       else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
      502              :         {
      503       248243 :           rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
      504       248243 :           op1 = legitimize_pic_address (op1, reg);
      505       248243 :           if (op0 == op1)
      506              :             return;
      507       248243 :           op1 = convert_to_mode (mode, op1, 1);
      508              :         }
      509              :     }
      510              :   else
      511              :     {
      512     72681799 :       if (MEM_P (op0)
      513     99212104 :           && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
      514     10723599 :               || !push_operand (op0, mode))
      515     84947231 :           && MEM_P (op1))
      516      2162751 :         op1 = force_reg (mode, op1);
      517              : 
      518     72681799 :       if (push_operand (op0, mode)
      519     72681799 :           && ! general_no_elim_operand (op1, mode))
      520         1004 :         op1 = copy_to_mode_reg (mode, op1);
      521              : 
      522              :       /* Force large constants in 64bit compilation into register
      523              :          to get them CSEed.  */
      524     72681799 :       if (can_create_pseudo_p ()
      525     66983393 :           && (mode == DImode) && TARGET_64BIT
      526     34867395 :           && immediate_operand (op1, mode)
      527      7898982 :           && !x86_64_zext_immediate_operand (op1, VOIDmode)
      528       723270 :           && !register_operand (op0, mode)
      529     72856817 :           && optimize)
      530       123293 :         op1 = copy_to_mode_reg (mode, op1);
      531              : 
      532     72681799 :       if (can_create_pseudo_p ())
      533              :         {
      534     66983393 :           if (CONST_DOUBLE_P (op1))
      535              :             {
      536              :               /* If we are loading a floating point constant to a
      537              :                  register, force the value to memory now, since we'll
      538              :                  get better code out the back end.  */
      539              : 
      540       895645 :               op1 = validize_mem (force_const_mem (mode, op1));
      541       895645 :               if (!register_operand (op0, mode))
      542              :                 {
      543       129759 :                   tmp = gen_reg_rtx (mode);
      544       129759 :                   emit_insn (gen_rtx_SET (tmp, op1));
      545       129759 :                   emit_move_insn (op0, tmp);
      546       129759 :                   return;
      547              :                 }
      548              :             }
      549              :         }
      550              :     }
      551              : 
      552              :   /* Special case inserting 64-bit values into a TImode register.  */
      553     72887849 :   if (TARGET_64BIT
      554              :       /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
      555     63164632 :       && (optimize || ix86_function_naked (current_function_decl))
      556     43264495 :       && (mode == DImode || mode == DFmode)
      557     29493119 :       && SUBREG_P (op0)
      558       480002 :       && GET_MODE (SUBREG_REG (op0)) == TImode
      559       397582 :       && REG_P (SUBREG_REG (op0))
      560     73285431 :       && REG_P (op1))
      561              :     {
      562              :       /* Use *insvti_lowpart_1 to set lowpart.  */
      563       177523 :       if (SUBREG_BYTE (op0) == 0)
      564              :         {
      565        52518 :           wide_int mask = wi::mask (64, true, 128);
      566        52518 :           tmp = immed_wide_int_const (mask, TImode);
      567        52518 :           op0 = SUBREG_REG (op0);
      568        52518 :           tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
      569        52518 :           if (mode == DFmode)
      570          355 :             op1 = gen_lowpart (DImode, op1);
      571        52518 :           op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
      572        52518 :           op1 = gen_rtx_IOR (TImode, tmp, op1);
      573        52518 :         }
      574              :       /* Use *insvti_highpart_1 to set highpart.  */
      575       125005 :       else if (SUBREG_BYTE (op0) == 8)
      576              :         {
      577       125005 :           wide_int mask = wi::mask (64, false, 128);
      578       125005 :           tmp = immed_wide_int_const (mask, TImode);
      579       125005 :           op0 = SUBREG_REG (op0);
      580       125005 :           tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
      581       125005 :           if (mode == DFmode)
      582          206 :             op1 = gen_lowpart (DImode, op1);
      583       125005 :           op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
      584       125005 :           op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
      585       125005 :           op1 = gen_rtx_IOR (TImode, tmp, op1);
      586       125005 :         }
      587              :     }
      588              : 
      589     72887849 :   emit_insn (gen_rtx_SET (op0, op1));
      590              : }
     591              : 
     592              : /* OP is a memref of CONST_VECTOR, return scalar constant mem
     593              :    if CONST_VECTOR is a vec_duplicate, else return NULL.  */
     594              : rtx
     595      2462446 : ix86_broadcast_from_constant (machine_mode mode, rtx op)
     596              : {
     597      2462446 :   int nunits = GET_MODE_NUNITS (mode);
     598      2462446 :   if (nunits < 2)
     599              :     return nullptr;
     600              : 
     601              :   /* Don't use integer vector broadcast if we can't move from GPR to SSE
     602              :      register directly.  */
     603      2333928 :   if (!TARGET_INTER_UNIT_MOVES_TO_VEC
     604         8172 :       && INTEGRAL_MODE_P (mode))
     605              :     return nullptr;
     606              : 
     607              :   /* Convert CONST_VECTOR to a non-standard SSE constant integer
     608              :      broadcast only if vector broadcast is available.  */
     609      2328366 :   if (standard_sse_constant_p (op, mode))
     610              :     return nullptr;
     611              : 
     612      4656726 :   if (GET_MODE_INNER (mode) == TImode)
     613              :     return nullptr;
     614              : 
     615      2328253 :   rtx constant = get_pool_constant (XEXP (op, 0));
     616      2328253 :   if (!CONST_VECTOR_P (constant))
     617              :     return nullptr;
     618              : 
     619              :   /* There could be some rtx like
     620              :      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     621              :      but with "*.LC1" refer to V2DI constant vector.  */
     622      2328253 :   if (GET_MODE (constant) != mode)
     623              :     {
     624          659 :       constant = simplify_subreg (mode, constant, GET_MODE (constant),
     625              :                                   0);
     626          659 :       if (constant == nullptr || !CONST_VECTOR_P (constant))
     627              :         return nullptr;
     628              :     }
     629              : 
     630      2328253 :   rtx first = XVECEXP (constant, 0, 0);
     631              : 
     632      7692460 :   for (int i = 1; i < nunits; ++i)
     633              :     {
     634      7075952 :       rtx tmp = XVECEXP (constant, 0, i);
     635              :       /* Vector duplicate value.  */
     636      7075952 :       if (!rtx_equal_p (tmp, first))
     637              :         return nullptr;
     638              :     }
     639              : 
     640              :   return first;
     641              : }
     642              : 
/* Expand a vector move OPERANDS[0] = OPERANDS[1] in mode MODE,
   legitimizing the operands so a valid insn can be emitted: resolves
   push operands, forces non-standard constants to memory or converts
   them to broadcasts, routes misaligned SSE memory operands to the
   misaligned-move expander, and splits TImode-subreg sources.  */
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  /* Force the inner constant to the constant pool (falling back
	     to a register when it has no memory representation) and
	     rewrap it under the original subreg.  */
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	{
	  /* Prefer a broadcast form of the wide-int constant; only
	     spill it to the constant pool when that fails.  */
	  machine_mode mode = GET_MODE (op0);
	  rtx tmp = ix86_convert_const_wide_int_to_broadcast
	    (mode, op1);
	  if (tmp == nullptr)
	    op1 = validize_mem (force_const_mem (mode, op1));
	  else
	    op1 = tmp;
	}
    }

  /* If OP1 is a 16-byte-or-wider constant-pool load whose constant is a
     vec_duplicate, expand it as a broadcast of the scalar element
     instead of a full vector load.  */
  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
	  && SYMBOL_REF_P (XEXP (op1, 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
	{
	  /* Broadcast to XMM/YMM/ZMM register from an integer
	     constant or scalar mem.  */
	  rtx tmp = gen_reg_rtx (mode);
	  if (FLOAT_MODE_P (mode))
	    first = force_const_mem (GET_MODE_INNER (mode), first);
	  bool ok = ix86_expand_vector_init_duplicate (false, mode,
						       tmp, first);
	  /* If the direct duplicate failed for a DImode element without
	     TARGET_64BIT, retry with the element forced to memory.  */
	  if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
	    {
	      first = force_const_mem (GET_MODE_INNER (mode), first);
	      ok = ix86_expand_vector_init_duplicate (false, mode,
						      tmp, first);
	    }
	  if (ok)
	    {
	      emit_move_insn (op0, tmp);
	      return;
	    }
	  /* Otherwise fall through to the generic paths below.  */
	}
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	{
	  rtx scratch = gen_reg_rtx (mode);
	  emit_move_insn (scratch, op1);
	  op1 = scratch;
	}

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to 128-bit vector conversions via V2DI.  */
  if (VECTOR_MODE_P (mode)
      && GET_MODE_SIZE (mode) == 16
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && ix86_pre_reload_split ())
    {
      /* Rebuild the vector from the two DImode halves of the TImode
	 source with vec_concat, instead of keeping a TImode subreg.  */
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (mode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
     789              : 
/* Split 32-byte AVX unaligned load and store if needed.  Exactly one of
   OP0/OP1 is a MEM; when the tuning flags do not request splitting, a
   single 256-bit move is emitted instead.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  /* No splitting requested for this direction: emit one 256-bit move.  */
  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  /* Canonicalize integer vector modes other than V32QI to V32QI so one
     extract pattern can serve them all.  */
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      /* Register destination: work in a fresh V32QI pseudo and
		 copy the result back at the end.  */
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  /* Select the vextractf128 generator and the 128-bit half mode.  */
  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16BFmode:
      extract = gen_avx_vextractf128v16bf;
      mode = V8BFmode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      /* Split load: read both 16-byte halves, then vec_concat them.  */
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      /* Split store: extract and store each 16-byte half.  */
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  /* Copy the V32QI working register back to the original mode.  */
  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
     879              : 
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         movhpd mem+8, reg
       }
 */

void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      /* 256-bit moves may need to be split per the tuning flags.  */
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* Pre-AVX: a plain unaligned move when the tuning says it is fast.  */
  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* Remaining float modes: split into two 8-byte half accesses.  */
  if (MEM_P (op1))
    {
      /* Unaligned load.  */
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
	{
	  rtx t;

	  /* Go through a V4SF temporary when MODE is not V4SF, since
	     loadlps/loadhps operate on V4SF.  */
	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      /* Unaligned store: write the two 8-byte halves separately.  */
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}
    1049              : 
    1050              : /* Move bits 64:95 to bits 32:63.  */
    1051              : 
    1052              : void
    1053          868 : ix86_move_vector_high_sse_to_mmx (rtx op)
    1054              : {
    1055          868 :   rtx mask = gen_rtx_PARALLEL (VOIDmode,
    1056              :                                gen_rtvec (4, GEN_INT (0), GEN_INT (2),
    1057              :                                           GEN_INT (0), GEN_INT (0)));
    1058          868 :   rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
    1059          868 :   op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
    1060          868 :   rtx insn = gen_rtx_SET (dest, op);
    1061          868 :   emit_insn (insn);
    1062          868 : }
    1063              : 
    1064              : /* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
    1065              : 
    1066              : void
    1067          778 : ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
    1068              : {
    1069          778 :   rtx op0 = operands[0];
    1070          778 :   rtx op1 = operands[1];
    1071          778 :   rtx op2 = operands[2];
    1072          778 :   rtx src;
    1073              : 
    1074          778 :   machine_mode dmode = GET_MODE (op0);
    1075          778 :   machine_mode smode = GET_MODE (op1);
    1076          778 :   machine_mode inner_dmode = GET_MODE_INNER (dmode);
    1077          778 :   machine_mode inner_smode = GET_MODE_INNER (smode);
    1078              : 
    1079              :   /* Get the corresponding SSE mode for destination.  */
    1080          778 :   int nunits = 16 / GET_MODE_SIZE (inner_dmode);
    1081         1556 :   machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
    1082         1556 :                                             nunits).require ();
    1083          778 :   machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
    1084         1556 :                                                  nunits / 2).require ();
    1085              : 
    1086              :   /* Get the corresponding SSE mode for source.  */
    1087          778 :   nunits = 16 / GET_MODE_SIZE (inner_smode);
    1088         1556 :   machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
    1089         1556 :                                             nunits).require ();
    1090              : 
    1091              :   /* Generate SSE pack with signed/unsigned saturation.  */
    1092          778 :   rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
    1093          778 :   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
    1094          778 :   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
    1095              : 
    1096              :   /* paskusdw/packuswb does unsigned saturation of a signed source
    1097              :      which is different from generic us_truncate RTX.  */
    1098          778 :   if (code == US_TRUNCATE)
    1099          676 :     src = gen_rtx_UNSPEC (sse_dmode,
    1100              :                           gen_rtvec (2, op1, op2),
    1101              :                           UNSPEC_US_TRUNCATE);
    1102              :   else
    1103              :     {
    1104          102 :       op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
    1105          102 :       op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
    1106          102 :       src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
    1107              :     }
    1108              : 
    1109          778 :   emit_move_insn (dest, src);
    1110              : 
    1111          778 :   ix86_move_vector_high_sse_to_mmx (op0);
    1112          778 : }
    1113              : 
    1114              : /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  This is also used
    1115              :    for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
    1116              :    OPERANDS[0].  */
    1117              : 
    1118              : void
    1119         6027 : ix86_split_mmx_punpck (rtx operands[], bool high_p)
    1120              : {
    1121         6027 :   rtx op0 = operands[0];
    1122         6027 :   rtx op1 = operands[1];
    1123         6027 :   rtx op2 = operands[2];
    1124         6027 :   machine_mode mode = GET_MODE (op1);
    1125         6027 :   rtx mask;
    1126              :   /* The corresponding SSE mode.  */
    1127         6027 :   machine_mode sse_mode, double_sse_mode;
    1128              : 
    1129         6027 :   switch (mode)
    1130              :     {
    1131         1582 :     case E_V8QImode:
    1132         1582 :     case E_V4QImode:
    1133         1582 :     case E_V2QImode:
    1134         1582 :       sse_mode = V16QImode;
    1135         1582 :       double_sse_mode = V32QImode;
    1136         1582 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1137              :                                gen_rtvec (16,
    1138              :                                           GEN_INT (0), GEN_INT (16),
    1139              :                                           GEN_INT (1), GEN_INT (17),
    1140              :                                           GEN_INT (2), GEN_INT (18),
    1141              :                                           GEN_INT (3), GEN_INT (19),
    1142              :                                           GEN_INT (4), GEN_INT (20),
    1143              :                                           GEN_INT (5), GEN_INT (21),
    1144              :                                           GEN_INT (6), GEN_INT (22),
    1145              :                                           GEN_INT (7), GEN_INT (23)));
    1146         1582 :       break;
    1147              : 
    1148         3316 :     case E_V4HImode:
    1149         3316 :     case E_V2HImode:
    1150         3316 :       sse_mode = V8HImode;
    1151         3316 :       double_sse_mode = V16HImode;
    1152         3316 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1153              :                                gen_rtvec (8,
    1154              :                                           GEN_INT (0), GEN_INT (8),
    1155              :                                           GEN_INT (1), GEN_INT (9),
    1156              :                                           GEN_INT (2), GEN_INT (10),
    1157              :                                           GEN_INT (3), GEN_INT (11)));
    1158         3316 :       break;
    1159              : 
    1160          778 :     case E_V2SImode:
    1161          778 :       sse_mode = V4SImode;
    1162          778 :       double_sse_mode = V8SImode;
    1163          778 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1164              :                                gen_rtvec (4,
    1165              :                                           GEN_INT (0), GEN_INT (4),
    1166              :                                           GEN_INT (1), GEN_INT (5)));
    1167          778 :       break;
    1168              : 
    1169          351 :     case E_V2SFmode:
    1170          351 :       sse_mode = V4SFmode;
    1171          351 :       double_sse_mode = V8SFmode;
    1172          351 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1173              :                                gen_rtvec (4,
    1174              :                                           GEN_INT (0), GEN_INT (4),
    1175              :                                           GEN_INT (1), GEN_INT (5)));
    1176          351 :       break;
    1177              : 
    1178            0 :     default:
    1179            0 :       gcc_unreachable ();
    1180              :     }
    1181              : 
    1182              :   /* Generate SSE punpcklXX.  */
    1183         6027 :   rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
    1184         6027 :   op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
    1185         6027 :   op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
    1186              : 
    1187         6027 :   op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
    1188         6027 :   op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
    1189         6027 :   rtx insn = gen_rtx_SET (dest, op2);
    1190         6027 :   emit_insn (insn);
    1191              : 
    1192              :   /* Move high bits to low bits.  */
    1193         6027 :   if (high_p)
    1194              :     {
    1195         2429 :       if (sse_mode == V4SFmode)
    1196              :         {
    1197          121 :           mask = gen_rtx_PARALLEL (VOIDmode,
    1198              :                                    gen_rtvec (4, GEN_INT (2), GEN_INT (3),
    1199              :                                               GEN_INT (4), GEN_INT (5)));
    1200          121 :           op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
    1201          121 :           op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
    1202              :         }
    1203              :       else
    1204              :         {
    1205         2308 :           int sz = GET_MODE_SIZE (mode);
    1206              : 
    1207         2308 :           if (sz == 4)
    1208          239 :             mask = gen_rtx_PARALLEL (VOIDmode,
    1209              :                                      gen_rtvec (4, GEN_INT (1), GEN_INT (0),
    1210              :                                                 GEN_INT (0), GEN_INT (1)));
    1211         2069 :           else if (sz == 8)
    1212         2069 :             mask = gen_rtx_PARALLEL (VOIDmode,
    1213              :                                      gen_rtvec (4, GEN_INT (2), GEN_INT (3),
    1214              :                                                 GEN_INT (0), GEN_INT (1)));
    1215              :           else
    1216            0 :             gcc_unreachable ();
    1217              : 
    1218         2308 :           dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
    1219         2308 :           op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
    1220              :         }
    1221              : 
    1222         2429 :       insn = gen_rtx_SET (dest, op1);
    1223         2429 :       emit_insn (insn);
    1224              :     }
    1225         6027 : }
    1226              : 
    1227              : /* Helper function of ix86_fixup_binary_operands to canonicalize
    1228              :    operand order.  Returns true if the operands should be swapped.  */
    1229              : 
    1230              : static bool
    1231    173561513 : ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
    1232              :                              rtx operands[])
    1233              : {
    1234    173561513 :   rtx dst = operands[0];
    1235    173561513 :   rtx src1 = operands[1];
    1236    173561513 :   rtx src2 = operands[2];
    1237              : 
    1238              :   /* If the operation is not commutative, we can't do anything.  */
    1239    173561513 :   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
    1240     26289958 :       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    1241              :     return false;
    1242              : 
    1243              :   /* Highest priority is that src1 should match dst.  */
    1244    147282784 :   if (rtx_equal_p (dst, src1))
    1245              :     return false;
    1246    107068271 :   if (rtx_equal_p (dst, src2))
    1247              :     return true;
    1248              : 
    1249              :   /* Next highest priority is that immediate constants come second.  */
    1250    106983972 :   if (immediate_operand (src2, mode))
    1251              :     return false;
    1252     25794075 :   if (immediate_operand (src1, mode))
    1253              :     return true;
    1254              : 
    1255              :   /* Lowest priority is that memory references should come second.  */
    1256     25794075 :   if (MEM_P (src2))
    1257              :     return false;
    1258     24369593 :   if (MEM_P (src1))
    1259              :     return true;
    1260              : 
    1261              :   return false;
    1262              : }
    1263              : 
    1264              : /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
    1265              :    destination to use for the operation.  If different from the true
    1266              :    destination in operands[0], a copy operation will be required except
    1267              :    under TARGET_APX_NDD.  */
    1268              : 
    1269              : rtx
    1270     13484290 : ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
    1271              :                             rtx operands[], bool use_ndd)
    1272              : {
    1273     13484290 :   rtx dst = operands[0];
    1274     13484290 :   rtx src1 = operands[1];
    1275     13484290 :   rtx src2 = operands[2];
    1276              : 
    1277              :   /* Canonicalize operand order.  */
    1278     13484290 :   if (ix86_swap_binary_operands_p (code, mode, operands))
    1279              :     {
    1280              :       /* It is invalid to swap operands of different modes.  */
    1281        87854 :       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
    1282              : 
    1283              :       std::swap (src1, src2);
    1284              :     }
    1285              : 
    1286              :   /* Both source operands cannot be in memory.  */
    1287     13484290 :   if (MEM_P (src1) && MEM_P (src2))
    1288              :     {
    1289              :       /* Optimization: Only read from memory once.  */
    1290       109858 :       if (rtx_equal_p (src1, src2))
    1291              :         {
    1292           17 :           src2 = force_reg (mode, src2);
    1293           17 :           src1 = src2;
    1294              :         }
    1295       109841 :       else if (rtx_equal_p (dst, src1))
    1296         3327 :         src2 = force_reg (mode, src2);
    1297              :       else
    1298       106514 :         src1 = force_reg (mode, src1);
    1299              :     }
    1300              : 
    1301              :   /* If the destination is memory, and we do not have matching source
    1302              :      operands, do things in registers.  */
    1303     13484290 :   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    1304       482825 :     dst = gen_reg_rtx (mode);
    1305              : 
    1306              :   /* Source 1 cannot be a constant.  */
    1307     13484290 :   if (CONSTANT_P (src1))
    1308          713 :     src1 = force_reg (mode, src1);
    1309              : 
    1310              :   /* Source 1 cannot be a non-matching memory.  */
    1311     13484290 :   if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    1312       438150 :     src1 = force_reg (mode, src1);
    1313              : 
    1314              :   /* Improve address combine.  */
    1315     13484290 :   if (code == PLUS
    1316      9932506 :       && GET_MODE_CLASS (mode) == MODE_INT
    1317      9821853 :       && MEM_P (src2))
    1318       176183 :     src2 = force_reg (mode, src2);
    1319              : 
    1320     13484290 :   operands[1] = src1;
    1321     13484290 :   operands[2] = src2;
    1322     13484290 :   return dst;
    1323              : }
    1324              : 
    1325              : /* Similarly, but assume that the destination has already been
    1326              :    set up properly.  */
    1327              : 
    1328              : void
    1329       290591 : ix86_fixup_binary_operands_no_copy (enum rtx_code code,
    1330              :                                     machine_mode mode, rtx operands[],
    1331              :                                     bool use_ndd)
    1332              : {
    1333       290591 :   rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
    1334       290591 :   gcc_assert (dst == operands[0]);
    1335       290591 : }
    1336              : 
    1337              : /* Attempt to expand a binary operator.  Make the expansion closer to the
    1338              :    actual machine, then just general_operand, which will allow 3 separate
    1339              :    memory references (one output, two input) in a single insn.  */
    1340              : 
    1341              : void
    1342     13193570 : ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
    1343              :                              rtx operands[], bool use_ndd)
    1344              : {
    1345     13193570 :   rtx src1, src2, dst, op, clob;
    1346              : 
    1347     13193570 :   dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
    1348     13193570 :   src1 = operands[1];
    1349     13193570 :   src2 = operands[2];
    1350              : 
    1351              :  /* Emit the instruction.  */
    1352              : 
    1353     13193570 :   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
    1354              : 
    1355     13193570 :   if (reload_completed
    1356        80854 :       && code == PLUS
    1357          904 :       && !rtx_equal_p (dst, src1)
    1358     13193570 :       && !use_ndd)
    1359              :     {
    1360              :       /* This is going to be an LEA; avoid splitting it later.  */
    1361            0 :       emit_insn (op);
    1362              :     }
    1363              :   else
    1364              :     {
    1365     13193570 :       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1366     13193570 :       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1367              :     }
    1368              : 
    1369              :   /* Fix up the destination if needed.  */
    1370     13193570 :   if (dst != operands[0])
    1371       482816 :     emit_move_insn (operands[0], dst);
    1372     13193570 : }
    1373              : 
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])
{
  /* OP1 is set to whichever input operand is a SUBREG (preferring
     operands[1]); OP2 is then the other input.  Both stay NULL_RTX when
     neither input is a SUBREG.  */
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || CONST_VECTOR_P (op2))
      /* The SUBREG must be a full-width view of a float vector of the
         same size as MODE, taken at offset 0.  */
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (CONST_VECTOR_P (op2)
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
        {
        case E_V4SFmode:
        case E_V8SFmode:
        case E_V16SFmode:
        case E_V2DFmode:
        case E_V4DFmode:
        case E_V8DFmode:
          /* Perform the operation in the float vector mode of the SUBREG
             operand, then move the result back to MODE.  */
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (CONST_VECTOR_P (op2))
            {
              /* Reinterpret the constant in the float vector mode and
                 load it into a register.  */
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
            }
          else
            {
              /* Both inputs are SUBREGs of float vectors; strip the
                 SUBREG from the second operand.  */
              op1 = operands[1];
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
            }
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
                                                  op1, op2)));
          emit_move_insn (operands[0], gen_lowpart (mode, dst));
          return;
        default:
          break;
        }
    }
  /* Fallback: perform the operation directly in the integer vector
     mode MODE.  */
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
}
    1451              : 
    1452              : /* Return TRUE or FALSE depending on whether the binary operator meets the
    1453              :    appropriate constraints.  */
    1454              : 
    1455              : bool
    1456    161062737 : ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
    1457              :                          rtx operands[3], bool use_ndd)
    1458              : {
    1459    161062737 :   rtx dst = operands[0];
    1460    161062737 :   rtx src1 = operands[1];
    1461    161062737 :   rtx src2 = operands[2];
    1462              : 
    1463              :   /* Both source operands cannot be in memory.  */
    1464    153716375 :   if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
    1465    161063122 :       && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    1466       985514 :     return false;
    1467              : 
    1468              :   /* Canonicalize operand order for commutative operators.  */
    1469    160077223 :   if (ix86_swap_binary_operands_p (code, mode, operands))
    1470       533055 :     std::swap (src1, src2);
    1471              : 
    1472              :   /* If the destination is memory, we must have a matching source operand.  */
    1473    160077223 :   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    1474              :     return false;
    1475              : 
    1476              :   /* Source 1 cannot be a constant.  */
    1477    155052503 :   if (CONSTANT_P (src1))
    1478              :     return false;
    1479              : 
    1480              :   /* Source 1 cannot be a non-matching memory.  */
    1481    155049454 :   if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    1482              :     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    1483      4421760 :     return (code == AND
    1484       512177 :             && (mode == HImode
    1485       512177 :                 || mode == SImode
    1486       308983 :                 || (TARGET_64BIT && mode == DImode))
    1487      4720228 :             && satisfies_constraint_L (src2));
    1488              : 
    1489              :   return true;
    1490              : }
    1491              : 
    1492              : /* Attempt to expand a unary operator.  Make the expansion closer to the
    1493              :    actual machine, then just general_operand, which will allow 2 separate
    1494              :    memory references (one output, one input) in a single insn.  */
    1495              : 
    1496              : void
    1497       118904 : ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
    1498              :                             rtx operands[], bool use_ndd)
    1499              : {
    1500       118904 :   bool matching_memory = false;
    1501       118904 :   rtx src, dst, op, clob;
    1502              : 
    1503       118904 :   dst = operands[0];
    1504       118904 :   src = operands[1];
    1505              : 
    1506              :   /* If the destination is memory, and we do not have matching source
    1507              :      operands, do things in registers.  */
    1508       118904 :   if (MEM_P (dst))
    1509              :     {
    1510         3225 :       if (rtx_equal_p (dst, src))
    1511              :         matching_memory = true;
    1512              :       else
    1513         2910 :         dst = gen_reg_rtx (mode);
    1514              :     }
    1515              : 
    1516              :   /* When source operand is memory, destination must match.  */
    1517       118904 :   if (!use_ndd && MEM_P (src) && !matching_memory)
    1518         4684 :     src = force_reg (mode, src);
    1519              : 
    1520              :   /* Emit the instruction.  */
    1521              : 
    1522       118904 :   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
    1523              : 
    1524       118904 :   if (code == NOT)
    1525        68249 :     emit_insn (op);
    1526              :   else
    1527              :     {
    1528        50655 :       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1529        50655 :       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1530              :     }
    1531              : 
    1532              :   /* Fix up the destination if needed.  */
    1533       118904 :   if (dst != operands[0])
    1534         2910 :     emit_move_insn (operands[0], dst);
    1535       118904 : }
    1536              : 
    1537              : /* Return TRUE or FALSE depending on whether the unary operator meets the
    1538              :    appropriate constraints.  */
    1539              : 
    1540              : bool
    1541      1723352 : ix86_unary_operator_ok (enum rtx_code,
    1542              :                         machine_mode,
    1543              :                         rtx operands[2],
    1544              :                         bool use_ndd)
    1545              : {
    1546              :   /* If one of operands is memory, source and destination must match.  */
    1547      1723352 :   if ((MEM_P (operands[0])
    1548      1680131 :        || (!use_ndd && MEM_P (operands[1])))
    1549      1752365 :       && ! rtx_equal_p (operands[0], operands[1]))
    1550              :     return false;
    1551              :   return true;
    1552              : }
    1553              : 
    1554              : /* Predict just emitted jump instruction to be taken with probability PROB.  */
    1555              : 
    1556              : static void
    1557        70669 : predict_jump (int prob)
    1558              : {
    1559        70669 :   rtx_insn *insn = get_last_insn ();
    1560        70669 :   gcc_assert (JUMP_P (insn));
    1561        70669 :   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
    1562        70669 : }
    1563              : 
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  /* Pick the full-width divmod generator matching MODE and the modes of
     the quotient (operands[0]) and remainder (operands[1]) destinations,
     which may be zero-extended to DImode.  */
  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
        {
          if (GET_MODE (operands[1]) == SImode)
            gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          else
            /* Remainder destination is wider than SImode.  */
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
        }
      else
        /* Quotient destination is wider than SImode.  */
        gen_divmod4_1
          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divimod if dividend and divisor are within
     the range [0-255].  The IOR of the two operands has no bits set
     above bit 7 exactly when both fit in that range, so a single TEST
     against -0x100 decides.  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  /* Conditional jump to the 8-bit path when the test succeeded.  */
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  /* Treat both paths as equally likely.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divimod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  /* Build the DIV/MOD rtxes only used for the REG_EQUAL notes below.  */
  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      /* Wrap the notes in ZERO_EXTEND when the destinations are wider
         than the computation mode.  */
      if (GET_MODE (operands[0]) != SImode)
        div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
        mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    (operands[0], tmp1,
                     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
    1675              : 
    1676              : /* Emit x86 binary operand CODE in mode MODE, where the first operand
    1677              :    matches destination.  RTX includes clobber of FLAGS_REG.  */
    1678              : 
    1679              : void
    1680         7832 : ix86_emit_binop (enum rtx_code code, machine_mode mode,
    1681              :                  rtx dst, rtx src)
    1682              : {
    1683         7832 :   rtx op, clob;
    1684              : 
    1685         7832 :   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
    1686         7832 :   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1687              : 
    1688         7832 :   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1689         7832 : }
    1690              : 
    1691              : /* Return true if regno1 def is nearest to the insn.  */
    1692              : 
    1693              : static bool
    1694           15 : find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
    1695              : {
    1696           15 :   rtx_insn *prev = insn;
    1697           15 :   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
    1698              : 
    1699           15 :   if (insn == start)
    1700              :     return false;
    1701           40 :   while (prev && prev != start)
    1702              :     {
    1703           30 :       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
    1704              :         {
    1705           10 :           prev = PREV_INSN (prev);
    1706           10 :           continue;
    1707              :         }
    1708           20 :       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
    1709              :         return true;
    1710           15 :       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
    1711              :         return false;
    1712           15 :       prev = PREV_INSN (prev);
    1713              :     }
    1714              : 
    1715              :   /* None of the regs is defined in the bb.  */
    1716              :   return false;
    1717              : }
    1718              : 
    1719              : /* INSN_UID of the last insn emitted by zero store peephole2s.  */
    1720              : int ix86_last_zero_store_uid;
    1721              : 
    1722              : /* Split lea instructions into a sequence of instructions
    1723              :    which are executed on ALU to avoid AGU stalls.
    1724              :    It is assumed that it is allowed to clobber flags register
    1725              :    at lea position.  */
    1726              : 
    1727              : void
    1728         6013 : ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
    1729              : {
    1730         6013 :   unsigned int regno0, regno1, regno2;
    1731         6013 :   struct ix86_address parts;
    1732         6013 :   rtx target, tmp;
    1733         6013 :   int ok, adds;
    1734              : 
    1735         6013 :   ok = ix86_decompose_address (operands[1], &parts);
    1736         6013 :   gcc_assert (ok);
    1737              : 
    1738         6013 :   target = gen_lowpart (mode, operands[0]);
    1739              : 
    1740         6013 :   regno0 = true_regnum (target);
    1741         6013 :   regno1 = INVALID_REGNUM;
    1742         6013 :   regno2 = INVALID_REGNUM;
    1743              : 
    1744         6013 :   if (parts.base)
    1745              :     {
    1746         6005 :       parts.base = gen_lowpart (mode, parts.base);
    1747         6005 :       regno1 = true_regnum (parts.base);
    1748              :     }
    1749              : 
    1750         6013 :   if (parts.index)
    1751              :     {
    1752         6010 :       parts.index = gen_lowpart (mode, parts.index);
    1753         6010 :       regno2 = true_regnum (parts.index);
    1754              :     }
    1755              : 
    1756         6013 :   if (parts.disp)
    1757          173 :     parts.disp = gen_lowpart (mode, parts.disp);
    1758              : 
    1759         6013 :   if (parts.scale > 1)
    1760              :     {
    1761              :       /* Case r1 = r1 + ...  */
    1762           11 :       if (regno1 == regno0)
    1763              :         {
    1764              :           /* If we have a case r1 = r1 + C * r2 then we
    1765              :              should use multiplication which is very
    1766              :              expensive.  Assume cost model is wrong if we
    1767              :              have such case here.  */
    1768            0 :           gcc_assert (regno2 != regno0);
    1769              : 
    1770            0 :           for (adds = parts.scale; adds > 0; adds--)
    1771            0 :             ix86_emit_binop (PLUS, mode, target, parts.index);
    1772              :         }
    1773              :       else
    1774              :         {
    1775              :           /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
    1776           11 :           if (regno0 != regno2)
    1777            8 :             emit_insn (gen_rtx_SET (target, parts.index));
    1778              : 
    1779              :           /* Use shift for scaling, but emit it as MULT instead
    1780              :              to avoid it being immediately peephole2 optimized back
    1781              :              into lea.  */
    1782           11 :           ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
    1783              : 
    1784           11 :           if (parts.base)
    1785            3 :             ix86_emit_binop (PLUS, mode, target, parts.base);
    1786              : 
    1787           11 :           if (parts.disp && parts.disp != const0_rtx)
    1788            3 :             ix86_emit_binop (PLUS, mode, target, parts.disp);
    1789              :         }
    1790              :     }
    1791         6002 :   else if (!parts.base && !parts.index)
    1792              :     {
    1793            0 :       gcc_assert(parts.disp);
    1794            0 :       emit_insn (gen_rtx_SET (target, parts.disp));
    1795              :     }
    1796              :   else
    1797              :     {
    1798         6002 :       if (!parts.base)
    1799              :         {
    1800            0 :           if (regno0 != regno2)
    1801            0 :             emit_insn (gen_rtx_SET (target, parts.index));
    1802              :         }
    1803         6002 :       else if (!parts.index)
    1804              :         {
    1805            3 :           if (regno0 != regno1)
    1806            1 :             emit_insn (gen_rtx_SET (target, parts.base));
    1807              :         }
    1808              :       else
    1809              :         {
    1810         5999 :           if (regno0 == regno1)
    1811              :             tmp = parts.index;
    1812         3111 :           else if (regno0 == regno2)
    1813              :             tmp = parts.base;
    1814              :           else
    1815              :             {
    1816           15 :               rtx tmp1;
    1817              : 
    1818              :               /* Find better operand for SET instruction, depending
    1819              :                  on which definition is farther from the insn.  */
    1820           15 :               if (find_nearest_reg_def (insn, regno1, regno2))
    1821            5 :                 tmp = parts.index, tmp1 = parts.base;
    1822              :               else
    1823           10 :                 tmp = parts.base, tmp1 = parts.index;
    1824              : 
    1825           15 :               emit_insn (gen_rtx_SET (target, tmp));
    1826              : 
    1827           15 :               if (parts.disp && parts.disp != const0_rtx)
    1828            0 :                 ix86_emit_binop (PLUS, mode, target, parts.disp);
    1829              : 
    1830           15 :               ix86_emit_binop (PLUS, mode, target, tmp1);
    1831           15 :               return;
    1832              :             }
    1833              : 
    1834         5984 :           ix86_emit_binop (PLUS, mode, target, tmp);
    1835              :         }
    1836              : 
    1837         5987 :       if (parts.disp && parts.disp != const0_rtx)
    1838            4 :         ix86_emit_binop (PLUS, mode, target, parts.disp);
    1839              :     }
    1840              : }
    1841              : 
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.

   OPERANDS[0] is the result register, OPERANDS[3] the FP input;
   OPERANDS[1], OPERANDS[2] are vector scratch registers and
   OPERANDS[4] holds the 0x1p31 constant (all set up by the insn
   pattern).  Inputs below 2^31 are converted directly; larger ones
   have 2^31 subtracted before the signed conversion and the sign bit
   xored back in afterwards.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss_v4sf (value, value, input));
      else
        emit_insn (gen_sse2_movsd_v2df (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  /* LARGE = comparison mask of elements where 2^31 <= VALUE.  */
  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  /* ZERO_OR_TWO31 = 2^31 in the "large" elements, 0.0 elsewhere.  */
  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  /* Bias large inputs down into signed range before converting.  */
  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  /* Reduce the all-ones comparison mask to just the integer sign bit.  */
  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  /* Xor the sign bit back in to undo the bias for large inputs.  */
  emit_insn (gen_xorv4si3 (value, value, large));
}
    1901              : 
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.

   TARGET receives the DFmode result; INPUT is the unsigned DImode
   value.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  /* Get the 64-bit input into the low half of an SSE register, using
     whichever transfer the target supports.  */
  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  /* Constant vector of the DF exponent words (see below).  */
  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  /* The scalar result is the low element of the V2DF sum.  */
  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
    1969              : 
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  /* ICE if this stub is ever actually reached.  */
  gcc_unreachable ();
}
    1976              : 
    1977              : static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
    1978              : 
    1979              : /* Convert an unsigned SImode value into a DFmode.  Only currently used
    1980              :    for SSE, but applicable anywhere.  */
    1981              : 
    1982              : void
    1983            0 : ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
    1984              : {
    1985            0 :   REAL_VALUE_TYPE TWO31r;
    1986            0 :   rtx x, fp;
    1987              : 
    1988            0 :   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
    1989              :                            NULL, 1, OPTAB_DIRECT);
    1990              : 
    1991            0 :   fp = gen_reg_rtx (DFmode);
    1992            0 :   emit_insn (gen_floatsidf2 (fp, x));
    1993              : 
    1994            0 :   real_ldexp (&TWO31r, &dconst1, 31);
    1995            0 :   x = const_double_from_real_value (TWO31r, DFmode);
    1996              : 
    1997            0 :   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
    1998              : 
    1999              :   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
    2000            0 :   if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    2001            0 :     x = ix86_expand_sse_fabs (x, NULL);
    2002              : 
    2003            0 :   if (x != target)
    2004            0 :     emit_move_insn (target, x);
    2005            0 : }
    2006              : 
    2007              : /* Convert a signed DImode value into a DFmode.  Only used for SSE in
    2008              :    32-bit mode; otherwise we have a direct convert instruction.  */
    2009              : 
    2010              : void
    2011            0 : ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
    2012              : {
    2013            0 :   REAL_VALUE_TYPE TWO32r;
    2014            0 :   rtx fp_lo, fp_hi, x;
    2015              : 
    2016            0 :   fp_lo = gen_reg_rtx (DFmode);
    2017            0 :   fp_hi = gen_reg_rtx (DFmode);
    2018              : 
    2019            0 :   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
    2020              : 
    2021            0 :   real_ldexp (&TWO32r, &dconst1, 32);
    2022            0 :   x = const_double_from_real_value (TWO32r, DFmode);
    2023            0 :   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
    2024              : 
    2025            0 :   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
    2026              : 
    2027            0 :   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
    2028              :                            0, OPTAB_DIRECT);
    2029            0 :   if (x != target)
    2030            0 :     emit_move_insn (target, x);
    2031            0 : }
    2032              : 
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.

   INPUT is split into its low and high 16-bit halves, each converted
   through the signed SImode->SFmode path (both halves fit in
   [0, 0x10000)), then recombined as hi * 0x1.0p16 + lo into TARGET.  */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  /* x = 0x1.0p16, the recombination scale factor.  */
  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
                                      NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
                                      NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      /* Single fused multiply-add: fp_hi * 2^16 + fp_lo.  */
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      /* Separate multiply and add.  */
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
                                   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
                                   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
        emit_move_insn (target, fp_hi);
    }
}
    2067              : 
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.

   Element-wise version of the 16/16-bit split used by
   ix86_expand_convert_uns_sisf_sse: lo = VAL & 0xffff, hi = VAL >> 16,
   both converted with the signed vector conversion, and recombined as
   hi * 0x1.0p16 + lo.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  /* Pick the signed conversion insn matching the vector width.  */
  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  /* tmp[1] = low 16 bits of each element, tmp[2] = high 16 bits.  */
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  /* tmp[5] = vector of the 0x1.0p16 scale factor.  */
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      /* Single fused multiply-add: hi * 2^16 + lo.  */
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
                                    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
                                    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
        emit_move_insn (target, tmp[7]);
    }
}
    2112              : 
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.

   Returns the adjusted (possibly biased) FP value; *XORP receives the
   per-element 0x80000000 correction mask the caller must xor into the
   converted integer result.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  /* two31r = vector of 0x1p31 in the element mode.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  /* Pick the mask-compare insn for this vector mode.  */
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  /* tmp[0] = all-ones mask in elements where 0x1p31 <= VAL.  */
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  /* tmp[1] = 0x1p31 in the "large" elements, 0.0 elsewhere.  */
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
                                0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    /* Reduce the all-ones mask to just the integer sign bit.  */
    *xorp = expand_simple_binop (intmode, ASHIFT,
                                 gen_lowpart (intmode, tmp[0]),
                                 GEN_INT (31), NULL_RTX, 0,
                                 OPTAB_DIRECT);
  else
    {
      /* No V8SImode shift without AVX2; AND the mask with a vector of
         0x80000000 instead.  */
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
                                   gen_lowpart (intmode, tmp[0]),
                                   two31, NULL_RTX, 0,
                                   OPTAB_DIRECT);
    }
  /* Subtract the 0x1p31 bias from the large elements only.  */
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
                              0, OPTAB_DIRECT);
}
    2164              : 
/* Generate code for floating point ABS or NEG.

   CODE is ABS or NEG, MODE the scalar or vector FP mode, OPERANDS[0]
   the destination and OPERANDS[1] the source.  Emits one insn: a
   PARALLEL of the SET with a USE of the sign-bit mask on the SSE
   path (plus a flags clobber for scalar non-TF modes), or a SET plus
   flags clobber on the integer-unit path.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                                rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  /* Decide whether SSE is used, and which vector mode the sign-bit
     mask constant is built in.  */
  switch (mode)
  {
  case E_HFmode:
    use_sse = true;
    vmode = V8HFmode;
    break;
  case E_BFmode:
    use_sse = true;
    vmode = V8BFmode;
    break;
  case E_SFmode:
    use_sse = TARGET_SSE_MATH && TARGET_SSE;
    vmode = V4SFmode;
    break;
  case E_DFmode:
    use_sse = TARGET_SSE_MATH && TARGET_SSE2;
    vmode = V2DFmode;
    break;
  default:
    use_sse = vector_mode || mode == TFmode;
    break;
  }

  dst = operands[0];
  src = operands[1];

  /* set = (set dst (abs/neg src)).  */
  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
         Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
        par = gen_rtvec (2, set, use);
      else
        {
          /* Scalar non-TF SSE forms additionally clobber the flags.  */
          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
          par = gen_rtvec (3, set, use, clob);
        }
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
    2233              : 
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.

   OPERANDS[0] and OPERANDS[1] must already match.  ABS clears the
   sign bit (AND with ~signbit, or storing 0 into a ZERO_EXTRACT of
   the sign bit); NEG flips it (XOR with signbit, or NOT of the
   ZERO_EXTRACT).  Emits one flag-clobbering integer insn.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                               rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      /* Whole value fits in SImode; sign bit is bit 31.  */
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
        {
          set = gen_int_mode (0x7fffffff, SImode);
          absneg_op = AND;
        }
      else
        {
          set = gen_int_mode (0x80000000, SImode);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
        {
          /* Address bit 63 directly as a 1-bit ZERO_EXTRACT of the
             DImode view.  */
          dst = gen_lowpart (DImode, operands[0]);
          dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

          if (code == ABS)
            set = const0_rtx;
          else
            set = gen_rtx_NOT (DImode, dst);
        }
      else
        {
          /* 32-bit: the sign bit lives in the high SImode word.  */
          dst = gen_highpart (SImode, operands[0]);

          if (code == ABS)
            {
              set = gen_int_mode (0x7fffffff, SImode);
              absneg_op = AND;
            }
          else
            {
              set = gen_int_mode (0x80000000, SImode);
              absneg_op = XOR;
            }
          set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
        }
      break;

    case E_XFmode:
      /* Operate on the word holding the 16-bit sign+exponent field.  */
      dst = gen_rtx_REG (SImode,
                         REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
        {
          set = GEN_INT (0x7fff);
          absneg_op = AND;
        }
      else
        {
          set = GEN_INT (0x8000);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
    2320              : 
/* Expand a copysign operation: operands[0] gets the magnitude of
   operands[1] combined with the sign of operands[2].  Special case the
   sign source (operands[2]) being a constant.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  /* Map the scalar mode to the vector mode used for the bitwise
     sign manipulation (TFmode is handled directly).  */
  switch (mode)
  {
  case E_HFmode:
    vmode = V8HFmode;
    break;
  case E_BFmode:
    vmode = V8BFmode;
    break;
  case E_SFmode:
    vmode = V4SFmode;
    break;
  case E_DFmode:
    vmode = V2DFmode;
    break;
  case E_TFmode:
    vmode = mode;
    break;
  default:
    gcc_unreachable();
  }

  /* copysign (x, x) is x.  */
  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  dest = operands[0];
  /* Compute into a vector view of the destination when possible;
     otherwise compute into a fresh vector pseudo and copy back at the
     end.  DEST is cleared to flag that a final copy is needed.  */
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);

  if (CONST_DOUBLE_P (operands[2]))
    {
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
        /* Simplify b = copysign (a, negative) to b = mask | a.  */
        op1 = gen_rtx_IOR (vmode, mask, op1);
      else
        {
          /* Simplify b = copysign (a, positive) to b = invert_mask & a.  */
          rtx invert_mask
            = ix86_build_signbit_mask (vmode,
                                       TARGET_AVX512F && mode != HFmode,
                                       true);
          op1 = gen_rtx_AND (vmode, invert_mask, op1);
        }
      emit_move_insn (vdest, op1);
      if (dest)
        emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
      return;
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);

  /* General case: vdest = (op1 & ~mask) | (op2 & mask).  */
  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  rtx invert_mask;
  /* NB: Generate vmovdqa, vpandn, vpand, vpor for AVX and generate pand,
     pand, por for SSE.  */
  if (TARGET_AVX)
    invert_mask = gen_rtx_NOT (vmode, mask);
  else
    invert_mask = ix86_build_signbit_mask (vmode,
                                           TARGET_AVX512F && mode != HFmode,
                                           true);
  emit_move_insn (op2, gen_rtx_AND (vmode, invert_mask, op1));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op0));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
    2406              : 
    2407              : /* Expand an xorsign operation.  */
    2408              : 
    2409              : void
    2410           20 : ix86_expand_xorsign (rtx operands[])
    2411              : {
    2412           20 :   machine_mode mode, vmode;
    2413           20 :   rtx dest, vdest, op0, op1, mask, x, temp;
    2414              : 
    2415           20 :   dest = operands[0];
    2416           20 :   op0 = operands[1];
    2417           20 :   op1 = operands[2];
    2418              : 
    2419           20 :   mode = GET_MODE (dest);
    2420              : 
    2421           20 :   switch (mode)
    2422              :   {
    2423              :   case E_HFmode:
    2424              :     vmode = V8HFmode;
    2425              :     break;
    2426              :   case E_BFmode:
    2427              :     vmode = V8BFmode;
    2428              :     break;
    2429              :   case E_SFmode:
    2430              :     vmode = V4SFmode;
    2431              :     break;
    2432              :   case E_DFmode:
    2433              :     vmode = V2DFmode;
    2434              :     break;
    2435            0 :   default:
    2436            0 :     gcc_unreachable ();
    2437           20 :     break;
    2438              :   }
    2439              : 
    2440           20 :   temp = gen_reg_rtx (vmode);
    2441           20 :   mask = ix86_build_signbit_mask (vmode, 0, 0);
    2442              : 
    2443           20 :   op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
    2444           20 :   x = gen_rtx_AND (vmode, op1, mask);
    2445           20 :   emit_insn (gen_rtx_SET (temp, x));
    2446              : 
    2447           20 :   op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
    2448           20 :   x = gen_rtx_XOR (vmode, temp, op0);
    2449              : 
    2450           20 :   vdest = lowpart_subreg (vmode, dest, mode);
    2451           20 :   if (vdest == NULL_RTX)
    2452            0 :     vdest = gen_reg_rtx (vmode);
    2453              :   else
    2454              :     dest = NULL_RTX;
    2455           20 :   emit_insn (gen_rtx_SET (vdest, x));
    2456              : 
    2457           20 :   if (dest)
    2458            0 :     emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
    2459           20 : }
    2460              : 
    2461              : static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
    2462              : 
/* Expand a conditional branch: compare OP0 against OP1 with condition
   CODE and jump to LABEL when the condition holds.  Vector equality
   tests and double-word integer comparisons get dedicated lowerings;
   everything else becomes a compare followed by a conditional jump.  */

void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction or vpcmpeq + kortest.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      || (mode == TImode && !TARGET_64BIT)
      || mode == OImode
      || GET_MODE_SIZE (mode) == 64)
    {
      unsigned msize = GET_MODE_SIZE (mode);
      machine_mode p_mode
        = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);

      /* Only equality/inequality is supported on this path.  */
      gcc_assert (code == EQ || code == NE);

      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
      if (msize == 64)
        {
          if (mode != V16SImode)
            {
              op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
              op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
            }

          tmp = gen_reg_rtx (HImode);
          emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
          emit_insn (gen_kortesthi_ccc (tmp, tmp));
        }
      /* Using ptest for 128/256-bit vectors.  */
      else
        {
          if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
            {
              op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
              op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
              mode = p_mode;
            }

          /* Generate XOR since we can't check that one operand is zero
             vector.  */
          tmp = gen_reg_rtx (mode);
          rtx ops[3] = { tmp, op0, op1 };
          ix86_expand_vector_logical_operator (XOR, mode, ops);
          tmp = gen_lowpart (p_mode, tmp);
          emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
                                  gen_rtx_UNSPEC (CCZmode,
                                                  gen_rtvec (2, tmp, tmp),
                                                  UNSPEC_PTEST)));
        }
      /* Branch on the flag register set above.  */
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_HFmode:
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      /* Single compare + conditional jump.  */
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_BFmode:
      gcc_assert (TARGET_AVX10_2 && !flag_trapping_math);
      goto simple;

    case E_DImode:
      if (TARGET_64BIT)
        goto simple;
      /* FALLTHRU */
    case E_TImode:
      /* DI and TI mode equality/inequality comparisons may be performed
         on SSE registers.  Avoid splitting them, except when optimizing
         for size.  */
      if ((code == EQ || code == NE)
          && !optimize_insn_for_size_p ())
        goto simple;

      /* Expand DImode branch into multiple compare+branch.  */
      {
        rtx lo[2], hi[2];
        rtx_code_label *label2;
        enum rtx_code code1, code2, code3;
        machine_mode submode;

        /* Canonicalize a constant into operand 1.  */
        if (CONSTANT_P (op0) && !CONSTANT_P (op1))
          {
            std::swap (op0, op1);
            code = swap_condition (code);
          }

        split_double_mode (mode, &op0, 1, lo+0, hi+0);
        split_double_mode (mode, &op1, 1, lo+1, hi+1);

        submode = mode == DImode ? SImode : DImode;

        /* If we are doing less-than or greater-or-equal-than,
           op1 is a constant and the low word is zero, then we can just
           examine the high word.  Similarly for low word -1 and
           less-or-equal-than or greater-than.  */

        if (CONST_INT_P (hi[1]))
          switch (code)
            {
            case LT: case LTU: case GE: case GEU:
              if (lo[1] == const0_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            case LE: case LEU: case GT: case GTU:
              if (lo[1] == constm1_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            default:
              break;
            }

        /* Emulate comparisons that do not depend on Zero flag with
           double-word subtraction.  Note that only Overflow, Sign
           and Carry flags are valid, so swap arguments and condition
           of comparisons that would otherwise test Zero flag.  */

        switch (code)
          {
          case LE: case LEU: case GT: case GTU:
            std::swap (lo[0], lo[1]);
            std::swap (hi[0], hi[1]);
            code = swap_condition (code);
            /* FALLTHRU */

          case LT: case LTU: case GE: case GEU:
            {
              bool uns = (code == LTU || code == GEU);
              rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
                = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

              /* Legitimize operands for the cmp and sbb patterns.  */
              if (!nonimmediate_operand (lo[0], submode))
                lo[0] = force_reg (submode, lo[0]);
              if (!x86_64_general_operand (lo[1], submode))
                lo[1] = force_reg (submode, lo[1]);

              if (!register_operand (hi[0], submode))
                hi[0] = force_reg (submode, hi[0]);
              if ((uns && !nonimmediate_operand (hi[1], submode))
                  || (!uns && !x86_64_general_operand (hi[1], submode)))
                hi[1] = force_reg (submode, hi[1]);

              /* cmp lo; sbb hi — then branch on the borrow/sign flags.  */
              emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

              tmp = gen_rtx_SCRATCH (submode);
              emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

              tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
              ix86_expand_branch (code, tmp, const0_rtx, label);
              return;
            }

          default:
            break;
          }

        /* Otherwise, we need two or three jumps.  */

        label2 = gen_label_rtx ();

        code1 = code;
        code2 = swap_condition (code);
        code3 = unsigned_condition (code);

        switch (code)
          {
          case LT: case GT: case LTU: case GTU:
            break;

          case LE:   code1 = LT;  code2 = GT;  break;
          case GE:   code1 = GT;  code2 = LT;  break;
          case LEU:  code1 = LTU; code2 = GTU; break;
          case GEU:  code1 = GTU; code2 = LTU; break;

          case EQ:   code1 = UNKNOWN; code2 = NE;  break;
          case NE:   code2 = UNKNOWN; break;

          default:
            gcc_unreachable ();
          }

        /*
         * a < b =>
         *    if (hi(a) < hi(b)) goto true;
         *    if (hi(a) > hi(b)) goto false;
         *    if (lo(a) < lo(b)) goto true;
         *  false:
         */

        if (code1 != UNKNOWN)
          ix86_expand_branch (code1, hi[0], hi[1], label);
        if (code2 != UNKNOWN)
          ix86_expand_branch (code2, hi[0], hi[1], label2);

        ix86_expand_branch (code3, lo[0], lo[1], label);

        if (code2 != UNKNOWN)
          emit_label (label2);
        return;
      }

    default:
      /* Anything else must already be a flags-register comparison.  */
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
    2697              : 
    2698              : /* Figure out whether to use unordered fp comparisons.  */
    2699              : 
    2700              : static bool
    2701      1147075 : ix86_unordered_fp_compare (enum rtx_code code)
    2702              : {
    2703      1147075 :   if (!TARGET_IEEE_FP)
    2704              :     return false;
    2705              : 
    2706      1142693 :   switch (code)
    2707              :     {
    2708              :     case LT:
    2709              :     case LE:
    2710              :     case GT:
    2711              :     case GE:
    2712              :     case LTGT:
    2713              :       return false;
    2714              : 
    2715              :     case EQ:
    2716              :     case NE:
    2717              : 
    2718              :     case UNORDERED:
    2719              :     case ORDERED:
    2720              :     case UNLT:
    2721              :     case UNLE:
    2722              :     case UNGT:
    2723              :     case UNGE:
    2724              :     case UNEQ:
    2725              :       return true;
    2726              : 
    2727            0 :     default:
    2728            0 :       gcc_unreachable ();
    2729              :     }
    2730              : }
    2731              : 
    2732              : /* Return a comparison we can do and that it is equivalent to
    2733              :    swap_condition (code) apart possibly from orderedness.
    2734              :    But, never change orderedness if TARGET_IEEE_FP, returning
    2735              :    UNKNOWN in that case if necessary.  */
    2736              : 
    2737              : static enum rtx_code
    2738        37367 : ix86_fp_swap_condition (enum rtx_code code)
    2739              : {
    2740        37367 :   switch (code)
    2741              :     {
    2742         1847 :     case GT:                   /* GTU - CF=0 & ZF=0 */
    2743         1847 :       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    2744          533 :     case GE:                   /* GEU - CF=0 */
    2745          533 :       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    2746          446 :     case UNLT:                 /* LTU - CF=1 */
    2747          446 :       return TARGET_IEEE_FP ? UNKNOWN : GT;
    2748         6315 :     case UNLE:                 /* LEU - CF=1 | ZF=1 */
    2749         6315 :       return TARGET_IEEE_FP ? UNKNOWN : GE;
    2750        28226 :     default:
    2751        28226 :       return swap_condition (code);
    2752              :     }
    2753              : }
    2754              : 
    2755              : /* Return cost of comparison CODE using the best strategy for performance.
    2756              :    All following functions do use number of instructions as a cost metrics.
    2757              :    In future this should be tweaked to compute bytes for optimize_size and
    2758              :    take into account performance of various instructions on various CPUs.  */
    2759              : 
    2760              : static int
    2761      1145940 : ix86_fp_comparison_cost (enum rtx_code code)
    2762              : {
    2763      1145940 :   int arith_cost;
    2764              : 
    2765              :   /* The cost of code using bit-twiddling on %ah.  */
    2766      1145940 :   switch (code)
    2767              :     {
    2768              :     case UNLE:
    2769              :     case UNLT:
    2770              :     case LTGT:
    2771              :     case GT:
    2772              :     case GE:
    2773              :     case UNORDERED:
    2774              :     case ORDERED:
    2775              :     case UNEQ:
    2776              :       arith_cost = 4;
    2777              :       break;
    2778        84249 :     case LT:
    2779        84249 :     case NE:
    2780        84249 :     case EQ:
    2781        84249 :     case UNGE:
    2782        84249 :       arith_cost = TARGET_IEEE_FP ? 5 : 4;
    2783              :       break;
    2784        24913 :     case LE:
    2785        24913 :     case UNGT:
    2786      1062502 :       arith_cost = TARGET_IEEE_FP ? 6 : 4;
    2787              :       break;
    2788            0 :     default:
    2789            0 :       gcc_unreachable ();
    2790              :     }
    2791              : 
    2792      1145940 :   switch (ix86_fp_comparison_strategy (code))
    2793              :     {
    2794      1145940 :     case IX86_FPCMP_COMI:
    2795      1145940 :       return arith_cost > 4 ? 3 : 2;
    2796            0 :     case IX86_FPCMP_SAHF:
    2797            0 :       return arith_cost > 4 ? 4 : 3;
    2798              :     default:
    2799              :       return arith_cost;
    2800              :     }
    2801              : }
    2802              : 
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (op_mode);

  /* Without a native BFmode compare, widen both BFmode operands to
     SFmode (zero-extend the 16-bit pattern and shift it into the high
     half of an SImode value, then view it as SFmode) and recurse to
     prepare the SFmode comparison.  */
  if (op_mode == BFmode && (!TARGET_AVX10_2 || flag_trapping_math))
    {
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
        op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
                                             op0, BFmode);
      else
        {
          rtx t1 = gen_reg_rtx (SImode);
          emit_insn (gen_zero_extendhisi2 (t1, op));
          emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
          op = gen_lowpart (SFmode, t1);
        }
      *pop0 = op;
      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
        op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
                                             op1, BFmode);
      else
        {
          rtx t1 = gen_reg_rtx (SImode);
          emit_insn (gen_zero_extendhisi2 (t1, op));
          emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
          op = gen_lowpart (SFmode, t1);
        }
      *pop1 = op;
      return ix86_prepare_fp_compare_args (code, pop0, pop1);
    }

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
          || (op_mode == XFmode
              && ! (standard_80387_constant_p (op0) == 1
                    || standard_80387_constant_p (op1) == 1)
              && GET_CODE (op1) != FLOAT)
          || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
         things around if they appear profitable, otherwise force op0
         into a register.  */

      if (standard_80387_constant_p (op0) == 0
          || (MEM_P (op0)
              && ! (standard_80387_constant_p (op1) == 0
                    || MEM_P (op1))))
        {
          /* Only swap if an orderedness-compatible swapped code
             exists (UNKNOWN means no such code under IEEE).  */
          enum rtx_code new_code = ix86_fp_swap_condition (code);
          if (new_code != UNKNOWN)
            {
              std::swap (op0, op1);
              code = new_code;
            }
        }

      if (!REG_P (op0))
        op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
        {
          int tmp = standard_80387_constant_p (op1);
          if (tmp == 0)
            /* Not loadable by fld1/fldz etc.; go through memory.  */
            op1 = validize_mem (force_const_mem (op_mode, op1));
          else if (tmp == 1)
            {
              if (TARGET_CMOVE)
                op1 = force_reg (op_mode, op1);
            }
          else
            op1 = force_reg (op_mode, op1);
        }
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
        op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
    2913              : 
/* Generate insn patterns to do a floating point compare of OPERANDS.
   Emit the comparison into the flags register and return an rtx of the
   form (CODE (reg FLAGS_REG) (const_int 0)) suitable as the condition
   of a bcc, setcc or cmov flags consumer.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  /* Canonicalize the operands; this may swap them and adjust CODE.  */
  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    /* Wrap in UNSPEC_NOTRAP to request a quiet compare for the
       unordered comparison codes.  */
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      /* Compare straight into EFLAGS (fcomi/comis[sd]-style).  */
      tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
      /* We only have vcomisbf16; there is no vcomubf16 nor vcomxbf16,
	 so skip the COMX/NOTRAP wrapping for BFmode.  */
      if (GET_MODE (op0) != E_BFmode)
        {
          if (TARGET_AVX10_2 && (code == EQ || code == NE))
            tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
          if (unordered_compare)
            tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
        }
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      /* fnstsw into a scratch HImode register, then sahf copies the
	 x87 status flags into EFLAGS.  */
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      /* fnstsw into a scratch register, then test/and/add/cmp on its
	 high byte.  The masks below (0x01, 0x04, 0x40, 0x45) select
	 the x87 condition bits C0/C2/C3 as they appear in AH.  */
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
         doesn't happen to work out to anything nice combination-wise.
         So do some bit twiddling on the value we've got in AH to come
         up with an appropriate set of condition codes.  */

      switch (code)
        {
        case GT:
        case UNGT:
          if (code == GT || !TARGET_IEEE_FP)
            {
              /* True iff (AH & 0x45) == 0.  */
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
              code = EQ;
            }
          else
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
              emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
              cmp_mode = CCmode;
              code = GEU;
            }
          break;
        case LT:
        case UNLT:
          if (code == LT && TARGET_IEEE_FP)
            {
              /* True iff (AH & 0x45) == 0x01.  */
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
              cmp_mode = CCmode;
              code = EQ;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
              code = NE;
            }
          break;
        case GE:
        case UNGE:
          if (code == GE || !TARGET_IEEE_FP)
            {
              /* True iff (AH & 0x05) == 0.  */
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
              code = EQ;
            }
          else
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
              code = NE;
            }
          break;
        case LE:
        case UNLE:
          if (code == LE && TARGET_IEEE_FP)
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
              emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
              cmp_mode = CCmode;
              code = LTU;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
              code = NE;
            }
          break;
        case EQ:
        case UNEQ:
          if (code == EQ && TARGET_IEEE_FP)
            {
              /* True iff (AH & 0x45) == 0x40.  */
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
              cmp_mode = CCmode;
              code = EQ;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
              code = NE;
            }
          break;
        case NE:
        case LTGT:
          if (code == NE && TARGET_IEEE_FP)
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
                                             GEN_INT (0x40)));
              code = NE;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
              code = EQ;
            }
          break;

        case UNORDERED:
          /* Unordered iff the 0x04 (C2) bit is set.  */
          emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
          code = NE;
          break;
        case ORDERED:
          emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
          code = EQ;
          break;

        default:
          gcc_unreachable ();
        }
        break;

    default:
      gcc_unreachable();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
                         gen_rtx_REG (cmp_mode, FLAGS_REG),
                         const0_rtx);
}
    3083              : 
/* Generate insn patterns to do an integer compare of OPERANDS.
   Emit the compare that sets FLAGS_REG and return an rtx of the form
   (CODE (reg FLAGS_REG) (const_int 0)) for the flags consumer.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Swap operands to emit carry flag comparison.  GTU/LEU become
     LTU/GEU, which map directly onto the carry flag.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* Attempt to use PTEST, if available, when testing vector modes for
     equality/inequality against zero.  */
  if (op1 == const0_rtx
      && SUBREG_P (op0)
      && cmpmode == CCZmode
      && SUBREG_BYTE (op0) == 0
      && REG_P (SUBREG_REG (op0))
      && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
      && TARGET_SSE4_1
      && GET_MODE (op0) == TImode
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
    {
      tmp = SUBREG_REG (op0);
      /* View HF/BF vectors as V8HI for the ptest pattern.  */
      if (GET_MODE (tmp) == V8HFmode || GET_MODE (tmp) == V8BFmode)
        tmp = gen_lowpart (V8HImode, tmp);
      /* ptest of the value against itself: ZF set iff all bits zero.  */
      tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
    }
  else
    tmp = gen_rtx_COMPARE (cmpmode, op0, op1);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
    3131              : 
    3132              : static rtx
    3133      7662638 : ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
    3134              : {
    3135      7662638 :   rtx ret;
    3136              : 
    3137      7662638 :   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    3138       130358 :     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
    3139              : 
    3140      7532280 :   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    3141              :     {
    3142       570884 :       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
    3143       570884 :       ret = ix86_expand_fp_compare (code, op0, op1);
    3144              :     }
    3145              :   else
    3146      6961396 :     ret = ix86_expand_int_compare (code, op0, op1);
    3147              : 
    3148      7662638 :   return ret;
    3149              : }
    3150              : 
    3151              : void
    3152       585615 : ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
    3153              : {
    3154       585615 :   rtx ret;
    3155              : 
    3156       585615 :   gcc_assert (GET_MODE (dest) == QImode);
    3157              : 
    3158       585615 :   ret = ix86_expand_compare (code, op0, op1);
    3159       585615 :   PUT_MODE (ret, QImode);
    3160       585615 :   emit_insn (gen_rtx_SET (dest, ret));
    3161       585615 : }
    3162              : 
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128.
   When OP2 is const0_rtx the unordered result is the literal -128 and
   the expansion uses branches; otherwise the unordered result is OP2
   itself and the ordered results are built with setcc arithmetic.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  /* The FNSTSW-based arithmetic strategy is not handled here; the
     flags must come directly from a fcomi/sahf-style compare.  */
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx zero = NULL_RTX;
  if (op2 != const0_rtx
      && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
      && GET_MODE (dest) == SImode)
    /* Pre-clear an SImode result register before the compare; used via
       the setcc_si_slp pattern below.  */
    zero = force_reg (SImode, const0_rtx);
  /* Emit the GT compare once; the CCFP flags it sets are reused for the
     UNORDERED, UNEQ and GT tests below.  */
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;
  if (l2)
    {
      /* TARGET_IEEE_FP: branch to L2 (the unordered result) on NaN.  */
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
                               gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
                                  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
    }
  if (op2 == const0_rtx)
    {
      /* Branchy expansion: L0 yields 0 (equal), L1 yields 1 (greater),
	 fall-through yields -1 (less).  */
      rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
                               gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
                                  gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::unlikely ());
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
                                  gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::even ());
      emit_move_insn (dest, constm1_rtx);
      emit_jump (lend);
      emit_label (l0);
      emit_move_insn (dest, const0_rtx);
      emit_jump (lend);
      emit_label (l1);
      emit_move_insn (dest, const1_rtx);
    }
  else
    {
      /* Arithmetic expansion: dest = (op0 > op1) - (op0 < op1).  */
      rtx lt_tmp = NULL_RTX;
      if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
        {
          /* lt_tmp = (op0 < op1), zero-extended to the result mode.  */
          lt_tmp = gen_reg_rtx (QImode);
          ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
                             const0_rtx);
          if (GET_MODE (dest) != QImode)
            {
              tmp = gen_reg_rtx (GET_MODE (dest));
              emit_insn (gen_rtx_SET (tmp,
                                      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                           lt_tmp)));
              lt_tmp = tmp;
            }
        }
      rtx gt_tmp;
      if (zero)
        {
          /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
             before the floating point comparison and use setcc_si_slp
             pattern to hide it from the combiner, so that it doesn't
             undo it.  Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
             the ZERO_EXTEND normally emitted would need to be AND
             with flags clobber.  */
          tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
          PUT_MODE (tmp, QImode);
          emit_insn (gen_setcc_si_slp (zero, tmp, zero));
          gt_tmp = zero;
        }
      else
        {
          /* gt_tmp = (op0 > op1), zero-extended to the result mode.  */
          gt_tmp = gen_reg_rtx (QImode);
          ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx);
          if (GET_MODE (dest) != QImode)
            {
              tmp = gen_reg_rtx (GET_MODE (dest));
              emit_insn (gen_rtx_SET (tmp,
                                      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                           gt_tmp)));
              gt_tmp = tmp;
            }
        }
      if (lt_tmp)
        {
          tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
                                     dest, 0, OPTAB_DIRECT);
          if (!rtx_equal_p (tmp, dest))
            emit_move_insn (dest, tmp);
        }
      else
        {
          /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
             do ZERO_EXTEND without clobbering flags.  */
          tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
          PUT_MODE (tmp, SImode);
          emit_insn (gen_subsi3_carry (dest, gt_tmp,
                                       force_reg (GET_MODE (dest), const0_rtx),
                                       XEXP (gt, 0), tmp));
        }
    }
  emit_jump (lend);
  if (l2)
    {
      /* Unordered result: -128 for the const0_rtx form, OP2 otherwise.  */
      emit_label (l2);
      emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
    }
  emit_label (lend);
}
    3281              : 
/* Expand integral op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1.
   OP2 selects the comparison kind: INTVAL (op2) == 1 uses the unsigned
   LTU/GTU codes, any other non-zero value the signed LT/GT codes.  */

void
ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  gcc_assert (INTVAL (op2));
  rtx zero1 = NULL_RTX, zero2 = NULL_RTX;
  /* For TARGET_ZERO_EXTEND_WITH_AND with an SImode result, pre-clear
     the result registers before the compare; they are filled via the
     setcc_si_slp pattern below.  */
  if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode)
    {
      zero1 = force_reg (SImode, const0_rtx);
      if (INTVAL (op2) != 1)
        zero2 = force_reg (SImode, const0_rtx);
    }

  /* Not using ix86_expand_int_compare here, so that it doesn't swap
     operands nor optimize CC mode - we need a mode usable for both
     LT and GT resp. LTU and GTU comparisons with the same unswapped
     operands.  */
  rtx flags = gen_rtx_REG (INTVAL (op2) != 1 ? CCGCmode : CCmode, FLAGS_REG);
  rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));
  rtx lt_tmp = NULL_RTX;
  if (zero2)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
         ZERO_EXTEND.  */
      tmp = ix86_expand_compare (LT, flags, const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero2, tmp, zero2));
      lt_tmp = zero2;
    }
  else if (!zero1)
    {
      /* lt_tmp = (op0 < op1), zero-extended to the result mode.  */
      lt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (lt_tmp, INTVAL (op2) != 1 ? LT : LTU, flags,
                         const0_rtx);
      if (GET_MODE (dest) != QImode)
        {
          tmp = gen_reg_rtx (GET_MODE (dest));
          emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                            lt_tmp)));
          lt_tmp = tmp;
        }
    }
  rtx gt_tmp;
  if (zero1)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
         ZERO_EXTEND.  */
      tmp = ix86_expand_compare (INTVAL (op2) != 1 ? GT : GTU, flags,
                                 const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero1, tmp, zero1));
      gt_tmp = zero1;
    }
  else
    {
      /* gt_tmp = (op0 > op1), zero-extended to the result mode.  */
      gt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (gt_tmp, INTVAL (op2) != 1 ? GT : GTU, flags,
                         const0_rtx);
      if (GET_MODE (dest) != QImode)
        {
          tmp = gen_reg_rtx (GET_MODE (dest));
          emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                            gt_tmp)));
          gt_tmp = tmp;
        }
    }
  if (lt_tmp)
    {
      /* dest = (op0 > op1) - (op0 < op1), i.e. -1, 0 or 1.  */
      tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
                                 0, OPTAB_DIRECT);
      if (!rtx_equal_p (tmp, dest))
        emit_move_insn (dest, tmp);
    }
  else
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
         do ZERO_EXTEND without clobbering flags.  */
      tmp = ix86_expand_compare (LTU, flags, const0_rtx);
      PUT_MODE (tmp, SImode);
      emit_insn (gen_subsi3_carry (dest, gt_tmp,
                                   force_reg (GET_MODE (dest), const0_rtx),
                                   flags, tmp));
    }
}
    3369              : 
    3370              : /* Expand comparison setting or clearing carry flag.  Return true when
    3371              :    successful and set pop for the operation.  */
    3372              : static bool
    3373        29317 : ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
    3374              : {
    3375        58634 :   machine_mode mode
    3376        29317 :     = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
    3377              : 
    3378              :   /* Do not handle double-mode compares that go through special path.  */
    3379        31703 :   if (mode == (TARGET_64BIT ? TImode : DImode))
    3380              :     return false;
    3381              : 
    3382        29307 :   if (SCALAR_FLOAT_MODE_P (mode))
    3383              :     {
    3384         1844 :       rtx compare_op;
    3385         1844 :       rtx_insn *compare_seq;
    3386              : 
    3387         1844 :       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
    3388              : 
    3389              :       /* Shortcut:  following common codes never translate
    3390              :          into carry flag compares.  */
    3391         1844 :       if (code == EQ || code == NE || code == UNEQ || code == LTGT
    3392              :           || code == ORDERED || code == UNORDERED)
    3393              :         return false;
    3394              : 
    3395              :       /* These comparisons require zero flag; swap operands so they won't.  */
    3396              :       if ((code == GT || code == UNLE || code == LE || code == UNGT)
    3397         1779 :           && !TARGET_IEEE_FP)
    3398              :         {
    3399            2 :           std::swap (op0, op1);
    3400            2 :           code = swap_condition (code);
    3401              :         }
    3402              : 
    3403              :       /* Try to expand the comparison and verify that we end up with
    3404              :          carry flag based comparison.  This fails to be true only when
    3405              :          we decide to expand comparison using arithmetic that is not
    3406              :          too common scenario.  */
    3407         1842 :       start_sequence ();
    3408         1842 :       compare_op = ix86_expand_fp_compare (code, op0, op1);
    3409         1842 :       compare_seq = end_sequence ();
    3410              : 
    3411         1842 :       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
    3412         1842 :         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
    3413              :       else
    3414            0 :         code = GET_CODE (compare_op);
    3415              : 
    3416         1842 :       if (code != LTU && code != GEU)
    3417              :         return false;
    3418              : 
    3419           63 :       emit_insn (compare_seq);
    3420           63 :       *pop = compare_op;
    3421           63 :       return true;
    3422              :     }
    3423              : 
    3424        27463 :   if (!INTEGRAL_MODE_P (mode))
    3425              :     return false;
    3426              : 
    3427        27331 :   switch (code)
    3428              :     {
    3429              :     case LTU:
    3430              :     case GEU:
    3431              :       break;
    3432              : 
    3433              :     /* Convert a==0 into (unsigned)a<1.  */
    3434        23807 :     case EQ:
    3435        23807 :     case NE:
    3436        23807 :       if (op1 != const0_rtx)
    3437              :         return false;
    3438        10038 :       op1 = const1_rtx;
    3439        10038 :       code = (code == EQ ? LTU : GEU);
    3440              :       break;
    3441              : 
    3442              :     /* Convert a>b into b<a or a>=b-1.  */
    3443          698 :     case GTU:
    3444          698 :     case LEU:
    3445          698 :       if (CONST_INT_P (op1))
    3446              :         {
    3447          656 :           op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
    3448              :           /* Bail out on overflow.  We still can swap operands but that
    3449              :              would force loading of the constant into register.  */
    3450          656 :           if (op1 == const0_rtx
    3451          656 :               || !x86_64_immediate_operand (op1, GET_MODE (op1)))
    3452            0 :             return false;
    3453          656 :           code = (code == GTU ? GEU : LTU);
    3454              :         }
    3455              :       else
    3456              :         {
    3457           42 :           std::swap (op0, op1);
    3458           42 :           code = (code == GTU ? LTU : GEU);
    3459              :         }
    3460              :       break;
    3461              : 
    3462              :     /* Convert a>=0 into (unsigned)a<0x80000000.  */
    3463         1300 :     case LT:
    3464         1300 :     case GE:
    3465         1300 :       if (mode == DImode || op1 != const0_rtx)
    3466              :         return false;
    3467          204 :       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
    3468          102 :       code = (code == LT ? GEU : LTU);
    3469              :       break;
    3470          833 :     case LE:
    3471          833 :     case GT:
    3472          833 :       if (mode == DImode || op1 != constm1_rtx)
    3473              :         return false;
    3474            0 :       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
    3475            0 :       code = (code == LE ? GEU : LTU);
    3476              :       break;
    3477              : 
    3478              :     default:
    3479              :       return false;
    3480              :     }
    3481              :   /* Swapping operands may cause constant to appear as first operand.  */
    3482        11531 :   if (!nonimmediate_operand (op0, VOIDmode))
    3483              :     {
    3484            0 :       if (!can_create_pseudo_p ())
    3485              :         return false;
    3486            0 :       op0 = force_reg (mode, op0);
    3487              :     }
    3488        11531 :   *pop = ix86_expand_compare (code, op0, op1);
    3489        11531 :   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
    3490              :   return true;
    3491              : }
    3492              : 
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  OPERANDS[0] = OPERANDS[2] +/- 1 when condition
   OPERANDS[1] holds, OPERANDS[2] otherwise; OPERANDS[3] is the +1/-1
   increment.  Returns false when the comparison cannot be turned into
   a carry-flag test.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  /* Generator for the carry-using add or sub pattern emitted below.  */
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  /* Only a +1 or -1 adjustment can be done with a single adc/sbb.  */
  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  /* Reduce the condition to a carry-flag (LTU/GEU) test.  */
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
     return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  /* Normalize to an LTU-style carry test: for GEU, reverse the
     condition and switch VAL from 0 to -1.  */
  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
        PUT_CODE (compare_op,
                  reverse_condition_maybe_unordered
                    (GET_CODE (compare_op)));
      else
        PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
    3547              : 
    3548              : bool
    3549       428902 : ix86_expand_int_movcc (rtx operands[])
    3550              : {
    3551       428902 :   enum rtx_code code = GET_CODE (operands[1]), compare_code;
    3552       428902 :   rtx_insn *compare_seq;
    3553       428902 :   rtx compare_op;
    3554       428902 :   machine_mode mode = GET_MODE (operands[0]);
    3555       428902 :   bool sign_bit_compare_p = false;
    3556       428902 :   bool negate_cc_compare_p = false;
    3557       428902 :   rtx op0 = XEXP (operands[1], 0);
    3558       428902 :   rtx op1 = XEXP (operands[1], 1);
    3559       428902 :   rtx op2 = operands[2];
    3560       428902 :   rtx op3 = operands[3];
    3561              : 
    3562       428902 :   if (GET_MODE (op0) == TImode
    3563       413508 :       || (GET_MODE (op0) == DImode
    3564        99795 :           && !TARGET_64BIT))
    3565              :     return false;
    3566              : 
    3567       412412 :   if (GET_MODE (op0) == BFmode
    3568       412412 :       && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    3569              :     return false;
    3570              : 
    3571       412412 :   start_sequence ();
    3572       412412 :   compare_op = ix86_expand_compare (code, op0, op1);
    3573       412412 :   compare_seq = end_sequence ();
    3574              : 
    3575       412412 :   compare_code = GET_CODE (compare_op);
    3576              : 
    3577       412412 :   if ((op1 == const0_rtx && (code == GE || code == LT))
    3578       370441 :       || (op1 == constm1_rtx && (code == GT || code == LE)))
    3579              :     sign_bit_compare_p = true;
    3580              : 
    3581              :   /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
    3582              :      but if op1 is a constant, the latter form allows more optimizations,
    3583              :      either through the last 2 ops being constant handling, or the one
    3584              :      constant and one variable cases.  On the other side, for cmov the
    3585              :      former might be better as we don't need to load the constant into
    3586              :      another register.  */
    3587       370441 :   if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
    3588              :     op2 = op1;
    3589              :   /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
    3590       411886 :   else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
    3591              :     op3 = op1;
    3592              : 
    3593              :   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
    3594              :      HImode insns, we'd be swallowed in word prefix ops.  */
    3595              : 
    3596         4854 :   if ((mode != HImode || TARGET_FAST_PREFIX)
    3597       442061 :       && (mode != (TARGET_64BIT ? TImode : DImode))
    3598       412412 :       && CONST_INT_P (op2)
    3599       444798 :       && CONST_INT_P (op3))
    3600              :     {
    3601        25425 :       rtx out = operands[0];
    3602        25425 :       HOST_WIDE_INT ct = INTVAL (op2);
    3603        25425 :       HOST_WIDE_INT cf = INTVAL (op3);
    3604        25425 :       HOST_WIDE_INT diff;
    3605              : 
    3606        25425 :       if ((mode == SImode
    3607        11902 :            || (TARGET_64BIT && mode == DImode))
    3608        18233 :           && (GET_MODE (op0) == SImode
    3609        14275 :               || (TARGET_64BIT && GET_MODE (op0) == DImode)))
    3610              :         {
    3611              :           /* Special case x != 0 ? -1 : y.  */
    3612        13102 :           if (code == NE && op1 == const0_rtx && ct == -1)
    3613              :             {
    3614              :               negate_cc_compare_p = true;
    3615              :               std::swap (ct, cf);
    3616              :               code = EQ;
    3617              :             }
    3618        13003 :           else if (code == EQ && op1 == const0_rtx && cf == -1)
    3619        25425 :             negate_cc_compare_p = true;
    3620              :         }
    3621              : 
    3622        25425 :       diff = (unsigned HOST_WIDE_INT) ct - cf;
    3623              :       /* Make sure we can represent the difference between the two values.  */
    3624        25425 :       if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3625       428902 :         return false;
    3626              : 
    3627              :       /*  Sign bit compares are better done using shifts than we do by using
    3628              :           sbb.  */
    3629        25277 :       if (sign_bit_compare_p
    3630        25277 :           || negate_cc_compare_p
    3631        25277 :           || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    3632              :         {
    3633              :           /* Detect overlap between destination and compare sources.  */
    3634        10964 :           rtx tmp = out;
    3635              : 
    3636        10964 :           if (negate_cc_compare_p)
    3637              :             {
    3638          280 :               if (GET_MODE (op0) == DImode)
    3639          106 :                 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
    3640              :               else
    3641          174 :                 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
    3642          174 :                                               gen_lowpart (SImode, op0)));
    3643              : 
    3644          280 :               tmp = gen_reg_rtx (mode);
    3645          280 :               if (mode == DImode)
    3646          123 :                 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
    3647              :               else
    3648          157 :                 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
    3649              :                                                                   tmp)));
    3650              :             }
    3651        10684 :           else if (!sign_bit_compare_p)
    3652              :             {
    3653        10315 :               rtx flags;
    3654        10315 :               bool fpcmp = false;
    3655              : 
    3656        10315 :               compare_code = GET_CODE (compare_op);
    3657              : 
    3658        10315 :               flags = XEXP (compare_op, 0);
    3659              : 
    3660        10315 :               if (GET_MODE (flags) == CCFPmode)
    3661              :                 {
    3662           59 :                   fpcmp = true;
    3663           59 :                   compare_code
    3664           59 :                     = ix86_fp_compare_code_to_integer (compare_code);
    3665              :                 }
    3666              : 
    3667              :               /* To simplify rest of code, restrict to the GEU case.  */
    3668        10315 :               if (compare_code == LTU)
    3669              :                 {
    3670         5963 :                   std::swap (ct, cf);
    3671         5963 :                   compare_code = reverse_condition (compare_code);
    3672         5963 :                   code = reverse_condition (code);
    3673              :                 }
    3674              :               else
    3675              :                 {
    3676         4352 :                   if (fpcmp)
    3677           59 :                     PUT_CODE (compare_op,
    3678              :                               reverse_condition_maybe_unordered
    3679              :                                 (GET_CODE (compare_op)));
    3680              :                   else
    3681         4293 :                     PUT_CODE (compare_op,
    3682              :                               reverse_condition (GET_CODE (compare_op)));
    3683              :                 }
    3684              : 
    3685        10315 :               diff = (unsigned HOST_WIDE_INT) ct - cf;
    3686              :               /* Make sure we can represent the difference
    3687              :                  between the two values.  */
    3688        10315 :               if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3689              :                 return false;
    3690              : 
    3691        10314 :               if (reg_overlap_mentioned_p (out, compare_op))
    3692            0 :                 tmp = gen_reg_rtx (mode);
    3693              : 
    3694        10314 :               if (mode == DImode)
    3695         2036 :                 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
    3696              :               else
    3697         8278 :                 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
    3698              :                                                  flags, compare_op));
    3699              :             }
    3700              :           else
    3701              :             {
    3702          369 :               if (code == GT || code == GE)
    3703          153 :                 code = reverse_condition (code);
    3704              :               else
    3705              :                 {
    3706          216 :                   std::swap (ct, cf);
    3707              : 
    3708          216 :                   diff = (unsigned HOST_WIDE_INT) ct - cf;
    3709              :                   /* Make sure we can represent the difference
    3710              :                      between the two values.  */
    3711          216 :                   if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3712              :                     return false;
    3713              :                 }
    3714          364 :               tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
    3715              :             }
    3716              : 
    3717        10958 :           if (diff == 1)
    3718              :             {
    3719              :               /*
    3720              :                * cmpl op0,op1
    3721              :                * sbbl dest,dest
    3722              :                * [addl dest, ct]
    3723              :                *
    3724              :                * Size 5 - 8.
    3725              :                */
    3726         1006 :               if (ct)
    3727          833 :                 tmp = expand_simple_binop (mode, PLUS,
    3728              :                                            tmp, GEN_INT (ct),
    3729              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3730              :             }
    3731         9952 :           else if (cf == -1)
    3732              :             {
    3733              :               /*
    3734              :                * cmpl op0,op1
    3735              :                * sbbl dest,dest
    3736              :                * orl $ct, dest
    3737              :                *
    3738              :                * Size 8.
    3739              :                */
    3740          599 :               tmp = expand_simple_binop (mode, IOR,
    3741              :                                          tmp, GEN_INT (ct),
    3742              :                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
    3743              :             }
    3744         9353 :           else if (diff == -1 && ct)
    3745              :             {
    3746              :               /*
    3747              :                * cmpl op0,op1
    3748              :                * sbbl dest,dest
    3749              :                * notl dest
    3750              :                * [addl dest, cf]
    3751              :                *
    3752              :                * Size 8 - 11.
    3753              :                */
    3754          599 :               tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
    3755          599 :               if (cf)
    3756          581 :                 tmp = expand_simple_binop (mode, PLUS,
    3757              :                                            copy_rtx (tmp), GEN_INT (cf),
    3758              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3759              :             }
    3760              :           else
    3761              :             {
    3762              :               /*
    3763              :                * cmpl op0,op1
    3764              :                * sbbl dest,dest
    3765              :                * [notl dest]
    3766              :                * andl cf - ct, dest
    3767              :                * [addl dest, ct]
    3768              :                *
    3769              :                * Size 8 - 11.
    3770              :                */
    3771              : 
    3772         8754 :               if (cf == 0)
    3773              :                 {
    3774          903 :                   cf = ct;
    3775          903 :                   ct = 0;
    3776          903 :                   tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
    3777              :                 }
    3778              : 
    3779         8754 :               HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
    3780              :               /* Make sure we can represent the difference
    3781              :                  between the two values.  */
    3782         8754 :               if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
    3783        16644 :                 return false;
    3784              : 
    3785         8754 :               tmp = expand_simple_binop (mode, AND,
    3786              :                                          copy_rtx (tmp),
    3787         8754 :                                          gen_int_mode (ival, mode),
    3788              :                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
    3789         8754 :               if (ct)
    3790         7057 :                 tmp = expand_simple_binop (mode, PLUS,
    3791              :                                            copy_rtx (tmp), GEN_INT (ct),
    3792              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3793              :             }
    3794              : 
    3795        10958 :           if (!rtx_equal_p (tmp, out))
    3796          474 :             emit_move_insn (copy_rtx (out), copy_rtx (tmp));
    3797              : 
    3798        10958 :           return true;
    3799              :         }
    3800              : 
    3801        14313 :       if (diff < 0)
    3802              :         {
    3803         4766 :           machine_mode cmp_mode = GET_MODE (op0);
    3804         4766 :           enum rtx_code new_code;
    3805              : 
    3806         4766 :           if (SCALAR_FLOAT_MODE_P (cmp_mode))
    3807              :             {
    3808           54 :               gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
    3809              : 
    3810              :               /* We may be reversing a non-trapping
    3811              :                  comparison to a trapping comparison.  */
    3812          104 :                   if (HONOR_NANS (cmp_mode) && flag_trapping_math
    3813           41 :                       && code != EQ && code != NE
    3814           95 :                       && code != ORDERED && code != UNORDERED)
    3815              :                     new_code = UNKNOWN;
    3816              :                   else
    3817           13 :                     new_code = reverse_condition_maybe_unordered (code);
    3818              :             }
    3819              :           else
    3820         4712 :             new_code = ix86_reverse_condition (code, cmp_mode);
    3821         4725 :           if (new_code != UNKNOWN)
    3822              :             {
    3823         4725 :               std::swap (ct, cf);
    3824              : 
    3825         4725 :               diff = (unsigned HOST_WIDE_INT) ct - cf;
    3826              :               /* Make sure we can represent the difference
    3827              :                  between the two values.  */
    3828         4725 :               if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3829              :                 return false;
    3830              : 
    3831              :               code = new_code;
    3832              :             }
    3833              :         }
    3834              : 
    3835        14313 :       compare_code = UNKNOWN;
    3836        14313 :       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
    3837        12546 :           && CONST_INT_P (op1))
    3838              :         {
    3839         6662 :           if (op1 == const0_rtx
    3840          214 :               && (code == LT || code == GE))
    3841              :             compare_code = code;
    3842         6662 :           else if (op1 == constm1_rtx)
    3843              :             {
    3844          289 :               if (code == LE)
    3845              :                 compare_code = LT;
    3846          289 :               else if (code == GT)
    3847              :                 compare_code = GE;
    3848              :             }
    3849              :         }
    3850              : 
    3851              :       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
    3852              :       if (compare_code != UNKNOWN
    3853            0 :           && GET_MODE (op0) == GET_MODE (out)
    3854            0 :           && (cf == -1 || ct == -1))
    3855              :         {
    3856              :           /* If lea code below could be used, only optimize
    3857              :              if it results in a 2 insn sequence.  */
    3858              : 
    3859            0 :           if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
    3860            0 :                  || diff == 3 || diff == 5 || diff == 9)
    3861            0 :               || (compare_code == LT && ct == -1)
    3862            0 :               || (compare_code == GE && cf == -1))
    3863              :             {
    3864              :               /*
    3865              :                * notl op1       (if necessary)
    3866              :                * sarl $31, op1
    3867              :                * orl cf, op1
    3868              :                */
    3869            0 :               if (ct != -1)
    3870              :                 {
    3871            0 :                   cf = ct;
    3872            0 :                   ct = -1;
    3873            0 :                   code = reverse_condition (code);
    3874              :                 }
    3875              : 
    3876            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
    3877              : 
    3878            0 :               out = expand_simple_binop (mode, IOR,
    3879              :                                          out, GEN_INT (cf),
    3880              :                                          out, 1, OPTAB_DIRECT);
    3881            0 :               if (out != operands[0])
    3882            0 :                 emit_move_insn (operands[0], out);
    3883              : 
    3884            0 :               return true;
    3885              :             }
    3886              :         }
    3887              : 
    3888              : 
    3889        21037 :       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
    3890         6724 :            || diff == 3 || diff == 5 || diff == 9)
    3891         7932 :           && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
    3892        22245 :           && (mode != DImode
    3893         1930 :               || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
    3894              :         {
    3895              :           /*
    3896              :            * xorl dest,dest
    3897              :            * cmpl op1,op2
    3898              :            * setcc dest
    3899              :            * lea cf(dest*(ct-cf)),dest
    3900              :            *
    3901              :            * Size 14.
    3902              :            *
    3903              :            * This also catches the degenerate setcc-only case.
    3904              :            */
    3905              : 
    3906         7932 :           rtx tmp;
    3907         7932 :           int nops;
    3908              : 
    3909         7932 :           out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
    3910              : 
    3911         7932 :           nops = 0;
    3912              :           /* On x86_64 the lea instruction operates on Pmode, so we need
    3913              :              to get arithmetics done in proper mode to match.  */
    3914         7932 :           if (diff == 1)
    3915         6712 :             tmp = copy_rtx (out);
    3916              :           else
    3917              :             {
    3918         1220 :               rtx out1;
    3919         1220 :               out1 = copy_rtx (out);
    3920         1220 :               tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
    3921         1220 :               nops++;
    3922         1220 :               if (diff & 1)
    3923              :                 {
    3924          254 :                   tmp = gen_rtx_PLUS (mode, tmp, out1);
    3925          254 :                   nops++;
    3926              :                 }
    3927              :             }
    3928         7932 :           if (cf != 0)
    3929              :             {
    3930         6968 :               tmp = plus_constant (mode, tmp, cf);
    3931         6968 :               nops++;
    3932              :             }
    3933         7932 :           if (!rtx_equal_p (tmp, out))
    3934              :             {
    3935         7208 :               if (nops == 1)
    3936         6086 :                 out = force_operand (tmp, copy_rtx (out));
    3937              :               else
    3938         1122 :                 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
    3939              :             }
    3940         7932 :           if (!rtx_equal_p (out, operands[0]))
    3941          888 :             emit_move_insn (operands[0], copy_rtx (out));
    3942              : 
    3943         7932 :           return true;
    3944              :         }
    3945              : 
    3946              :       /*
    3947              :        * General case:                  Jumpful:
    3948              :        *   xorl dest,dest               cmpl op1, op2
    3949              :        *   cmpl op1, op2                movl ct, dest
    3950              :        *   setcc dest                   jcc 1f
    3951              :        *   decl dest                    movl cf, dest
    3952              :        *   andl (cf-ct),dest            1:
    3953              :        *   addl ct,dest
    3954              :        *
    3955              :        * Size 20.                       Size 14.
    3956              :        *
    3957              :        * This is reasonably steep, but branch mispredict costs are
    3958              :        * high on modern cpus, so consider failing only if optimizing
    3959              :        * for space.
    3960              :        */
    3961              : 
    3962         6381 :       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    3963         6381 :           && BRANCH_COST (optimize_insn_for_speed_p (),
    3964              :                           false) >= 2)
    3965              :         {
    3966            0 :           if (cf == 0)
    3967              :             {
    3968            0 :               machine_mode cmp_mode = GET_MODE (op0);
    3969            0 :               enum rtx_code new_code;
    3970              : 
    3971            0 :               if (SCALAR_FLOAT_MODE_P (cmp_mode))
    3972              :                 {
    3973            0 :                   gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
    3974              : 
    3975              :                   /* We may be reversing a non-trapping
    3976              :                      comparison to a trapping comparison.  */
    3977            0 :                   if (HONOR_NANS (cmp_mode) && flag_trapping_math
    3978            0 :                       && code != EQ && code != NE
    3979            0 :                       && code != ORDERED && code != UNORDERED)
    3980              :                     new_code = UNKNOWN;
    3981              :                   else
    3982            0 :                     new_code = reverse_condition_maybe_unordered (code);
    3983              : 
    3984              :                 }
    3985              :               else
    3986              :                 {
    3987            0 :                   new_code = ix86_reverse_condition (code, cmp_mode);
    3988            0 :                   if (compare_code != UNKNOWN && new_code != UNKNOWN)
    3989            0 :                     compare_code = reverse_condition (compare_code);
    3990              :                 }
    3991              : 
    3992            0 :               if (new_code != UNKNOWN)
    3993              :                 {
    3994            0 :                   cf = ct;
    3995            0 :                   ct = 0;
    3996            0 :                   code = new_code;
    3997              :                 }
    3998              :             }
    3999              : 
    4000            0 :           if (compare_code != UNKNOWN)
    4001              :             {
    4002              :               /* notl op1       (if needed)
    4003              :                  sarl $31, op1
    4004              :                  andl (cf-ct), op1
    4005              :                  addl ct, op1
    4006              : 
    4007              :                  For x < 0 (resp. x <= -1) there will be no notl,
    4008              :                  so if possible swap the constants to get rid of the
    4009              :                  complement.
    4010              :                  True/false will be -1/0 while code below (store flag
    4011              :                  followed by decrement) is 0/-1, so the constants need
    4012              :                  to be exchanged once more.  */
    4013              : 
    4014            0 :               if (compare_code == GE || !cf)
    4015              :                 {
    4016            0 :                   code = reverse_condition (code);
    4017            0 :                   compare_code = LT;
    4018              :                 }
    4019              :               else
    4020              :                 std::swap (ct, cf);
    4021              : 
    4022            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
    4023              :             }
    4024              :           else
    4025              :             {
    4026            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
    4027              : 
    4028            0 :               out = expand_simple_binop (mode, PLUS, copy_rtx (out),
    4029              :                                          constm1_rtx,
    4030              :                                          copy_rtx (out), 1, OPTAB_DIRECT);
    4031              :             }
    4032              : 
    4033            0 :           HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
    4034              :           /* Make sure we can represent the difference
    4035              :              between the two values.  */
    4036            0 :           if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
    4037              :             return false;
    4038              : 
    4039            0 :           out = expand_simple_binop (mode, AND, copy_rtx (out),
    4040            0 :                                      gen_int_mode (ival, mode),
    4041              :                                      copy_rtx (out), 1, OPTAB_DIRECT);
    4042            0 :           if (ct)
    4043            0 :             out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
    4044              :                                        copy_rtx (out), 1, OPTAB_DIRECT);
    4045            0 :           if (!rtx_equal_p (out, operands[0]))
    4046            0 :             emit_move_insn (operands[0], copy_rtx (out));
    4047              : 
    4048            0 :           return true;
    4049              :         }
    4050              :     }
    4051              : 
    4052       393368 :   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    4053              :     {
    4054              :       /* Try a few things more with specific constants and a variable.  */
    4055              : 
    4056            0 :       optab op;
    4057            0 :       rtx var, orig_out, out, tmp;
    4058              : 
    4059            0 :       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
    4060              :         return false;
    4061              : 
    4062            0 :       operands[2] = op2;
    4063            0 :       operands[3] = op3;
    4064              : 
    4065              :       /* If one of the two operands is an interesting constant, load a
    4066              :          constant with the above and mask it in with a logical operation.  */
    4067              : 
    4068            0 :       if (CONST_INT_P (operands[2]))
    4069              :         {
    4070            0 :           var = operands[3];
    4071            0 :           if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
    4072            0 :             operands[3] = constm1_rtx, op = and_optab;
    4073            0 :           else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
    4074            0 :             operands[3] = const0_rtx, op = ior_optab;
    4075              :           else
    4076              :             return false;
    4077              :         }
    4078            0 :       else if (CONST_INT_P (operands[3]))
    4079              :         {
    4080            0 :           var = operands[2];
    4081            0 :           if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
    4082              :             {
    4083              :               /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
    4084              :                  "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
    4085            0 :               if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
    4086            0 :                 operands[1] = simplify_gen_relational (LT, VOIDmode,
    4087            0 :                                                        GET_MODE (op0),
    4088              :                                                        op0, const0_rtx);
    4089              : 
    4090            0 :               operands[2] = constm1_rtx;
    4091            0 :               op = and_optab;
    4092              :             }
    4093            0 :           else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
    4094            0 :             operands[2] = const0_rtx, op = ior_optab;
    4095              :           else
    4096              :             return false;
    4097              :         }
    4098              :       else
    4099              :         return false;
    4100              : 
    4101            0 :       orig_out = operands[0];
    4102            0 :       tmp = gen_reg_rtx (mode);
    4103            0 :       operands[0] = tmp;
    4104              : 
    4105              :       /* Recurse to get the constant loaded.  */
    4106            0 :       if (!ix86_expand_int_movcc (operands))
    4107              :         return false;
    4108              : 
    4109              :       /* Mask in the interesting variable.  */
    4110            0 :       out = expand_binop (mode, op, var, tmp, orig_out, 0,
    4111              :                           OPTAB_WIDEN);
    4112            0 :       if (!rtx_equal_p (out, orig_out))
    4113            0 :         emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
    4114              : 
    4115            0 :       return true;
    4116              :     }
    4117              : 
    4118              :   /*
    4119              :    * For comparison with above,
    4120              :    *
    4121              :    * movl cf,dest
    4122              :    * movl ct,tmp
    4123              :    * cmpl op1,op2
    4124              :    * cmovcc tmp,dest
    4125              :    *
    4126              :    * Size 15.
    4127              :    */
    4128              : 
    4129       393368 :   if (! nonimmediate_operand (operands[2], mode))
    4130        22461 :     operands[2] = force_reg (mode, operands[2]);
    4131       393368 :   if (! nonimmediate_operand (operands[3], mode))
    4132       172914 :     operands[3] = force_reg (mode, operands[3]);
    4133              : 
    4134       393368 :   if (! register_operand (operands[2], VOIDmode)
    4135       393368 :       && (mode == QImode
    4136         1093 :           || ! register_operand (operands[3], VOIDmode)))
    4137         1564 :     operands[2] = force_reg (mode, operands[2]);
    4138              : 
    4139       393368 :   if (mode == QImode
    4140       393368 :       && ! register_operand (operands[3], VOIDmode))
    4141          592 :     operands[3] = force_reg (mode, operands[3]);
    4142              : 
    4143       393368 :   emit_insn (compare_seq);
    4144       393368 :   emit_insn (gen_rtx_SET (operands[0],
    4145              :                           gen_rtx_IF_THEN_ELSE (mode,
    4146              :                                                 compare_op, operands[2],
    4147              :                                                 operands[3])));
    4148       393368 :   return true;
    4149              : }
    4150              : 
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   DEST = (CMP_OP0 CODE CMP_OP1) ? IF_TRUE : IF_FALSE, where the
   compare operands must also be the two move arms for the pattern
   to match a min or max.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
                           rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode = GET_MODE (dest);
  bool is_min;
  rtx tmp;

  /* Canonicalize the comparison so that we effectively test with LT
     (or its NaN-inverted form UNGE, handled by swapping the arms).  */
  if (code == LT)
    ;
  else if (code == LE && !HONOR_NANS (mode))
    {
      /* We can swap LE to GE and then invert to LT.  */
      std::swap (cmp_op0, cmp_op1);
      std::swap (if_true, if_false);
    }
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  /* The move arms must be exactly the compare operands; their order
     determines whether this is a min or a max.  */
  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  /* SSE min/max insns cannot take immediates.  */
  if (immediate_operand (if_false, mode))
    if_false = force_reg (mode, if_false);
  if (immediate_operand (if_true, mode))
    if_true = force_reg (mode, if_true);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      /* Operand order matters for IEEE semantics (NaNs, -0.0 vs +0.0),
	 so wrap in an UNSPEC that fixes the order.  */
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      /* Fast-math: a plain SMIN/SMAX is fine.  At most one memory
	 operand is allowed.  */
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
        if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
    4213              : 
    4214              : /* Return true if MODE is valid for vector compare to mask register,
    4215              :    Same result for conditionl vector move with mask register.  */
    4216              : static bool
    4217        14990 : ix86_valid_mask_cmp_mode (machine_mode mode)
    4218              : {
    4219              :   /* XOP has its own vector conditional movement.  */
    4220        14990 :   if (TARGET_XOP && !TARGET_AVX512F)
    4221              :     return false;
    4222              : 
    4223              :   /* HFmode only supports vcmpsh whose dest is mask register.  */
    4224        14984 :   if (TARGET_AVX512FP16 && mode == HFmode)
    4225              :     return true;
    4226              : 
    4227              :   /* AVX512F is needed for mask operation.  */
    4228        14892 :   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    4229              :     return false;
    4230              : 
    4231              :   /* AVX512BW is needed for vector QI/HImode,
    4232              :      AVX512VL is needed for 128/256-bit vector.  */
    4233          182 :   machine_mode inner_mode = GET_MODE_INNER (mode);
    4234          182 :   int vector_size = GET_MODE_SIZE (mode);
    4235          182 :   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    4236              :     return false;
    4237              : 
    4238          162 :   return vector_size == 64 || TARGET_AVX512VL;
    4239              : }
    4240              : 
    4241              : /* Return true if integer mask comparison should be used.  */
    4242              : static bool
    4243        52748 : ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
    4244              :                      rtx op_true, rtx op_false)
    4245              : {
    4246        52748 :   int vector_size = GET_MODE_SIZE (mode);
    4247              : 
    4248        52748 :   if (cmp_mode == HFmode)
    4249              :     return true;
    4250        52656 :   else if (vector_size < 16)
    4251              :     return false;
    4252        46349 :   else if (vector_size == 64)
    4253              :     return true;
    4254        92582 :   else if (GET_MODE_INNER (cmp_mode) == HFmode)
    4255              :     return true;
    4256        92582 :   else if (GET_MODE_INNER (cmp_mode) == BFmode)
    4257              :     return true;
    4258              : 
    4259              :   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
    4260        46291 :   gcc_assert (!op_true == !op_false);
    4261              : 
    4262              :   /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
    4263              :      vector dest is required.  */
    4264        46291 :   if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    4265              :     return false;
    4266              : 
    4267              :   /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
    4268           48 :   if (op_false == CONST0_RTX (mode)
    4269           48 :       || op_true == CONST0_RTX (mode)
    4270           48 :       || (INTEGRAL_MODE_P (mode)
    4271           40 :           && (op_true == CONSTM1_RTX (mode)
    4272           40 :               || op_false == CONSTM1_RTX (mode))))
    4273            0 :     return false;
    4274              : 
    4275              :   return true;
    4276              : }
    4277              : 
/* Expand an SSE comparison.  Return the register with the result.

   DEST suggests the destination; a fresh pseudo is used instead when
   DEST overlaps the move arms, when optimizing, or when the result
   mode differs.  OP_TRUE/OP_FALSE are only inspected for overlap and
   for deciding whether a mask-register compare applies; they are not
   consumed here.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
                     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      /* Mask result: an integer mode with one bit per vector element
	 (QImode minimum).  */
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  /* Vector compares allow a memory/vector operand; scalar compares a
     nonimmediate one.  */
  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  /* Use a fresh pseudo when DEST would be clobbered before the arms
     are read, when its mode doesn't match the compare result, or when
     optimizing (let the RA pick).  */
  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      /* Result mode differs from DEST's mode: materialize then convert.  */
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
    4336              : 
    4337              : /* Emit x86 binary operand CODE in mode MODE for SSE vector
    4338              :    instructions that can be performed using GP registers.  */
    4339              : 
    4340              : static void
    4341         7217 : ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
    4342              :                      rtx dst, rtx src1, rtx src2)
    4343              : {
    4344         7217 :   rtx tmp;
    4345              : 
    4346         7217 :   tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
    4347              : 
    4348         7217 :   if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
    4349         7217 :       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    4350              :     {
    4351           94 :       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    4352           94 :       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    4353              :     }
    4354              : 
    4355         7217 :   emit_insn (tmp);
    4356         7217 : }
    4357              : 
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.

   CMP is a precomputed comparison result: either an all-ones/all-zeros
   vector per element, or (with AVX512) an integer mask register.  The
   expansion tries, in order: trivial identical arms, mask-register
   blends, AND/ANDN shortcuts when an arm is constant 0/-1, XOP's
   native vector cmov, SSE4.1/AVX blend insns, and finally the generic
   (CMP & OP_TRUE) | (~CMP & OP_FALSE) sequence.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
                 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
                  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
        {
	  /* Invert the mask so the zero arm becomes the "false" side of
	     the vec_merge below.  */
          if (cmpmode == E_DImode && !TARGET_64BIT)
            {
	      /* No 64-bit GP NOT on 32-bit targets; use knotdi.  */
              x = gen_reg_rtx (cmpmode);
              emit_insn (gen_knotdi (x, cmp));
            }
          else
            x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
          cmp = x;
          /* Reverse op_true op_false.  */
          std::swap (op_true, op_false);
        }

      if (mode == HFmode)
        emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
        emit_insn (gen_rtx_SET (dest,
                                gen_rtx_VEC_MERGE (mode,
                                                   op_true, op_false, cmp)));
      return;
    }

  /* CMP ? -1 : 0 is simply CMP itself.  */
  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      emit_move_insn (dest, cmp);
      return;
    }
  /* CMP ? X : 0 is CMP & X.  */
  else if (op_false == CONST0_RTX (mode))
    {
      x = expand_simple_binop (mode, AND, cmp, op_true,
                               dest, 1, OPTAB_DIRECT);
      if (x != dest)
        emit_move_insn (dest, x);
      return;
    }
  /* CMP ? 0 : X is ~CMP & X (andn).  */
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  /* CMP ? -1 : X is CMP | X.  */
  else if (vector_all_ones_operand (op_true, mode))
    {
      x = expand_simple_binop (mode, IOR, cmp, op_false,
                               dest, 1, OPTAB_DIRECT);
      if (x != dest)
        emit_move_insn (dest, x);
      return;
    }

  /* XOP has a native vector conditional move (vpcmov).  */
  if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
          || !nonimmediate_operand (op_false, mode))
        op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
                              gen_rtx_IF_THEN_ELSE (mode, cmp,
                                                    op_true, op_false)));
      return;
    }

  /* Pick a blend insn for the mode, if the target has one and blends
     are profitable.  BLEND_MODE may differ from MODE when the blend
     only exists at byte granularity (pblendvb).  */
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2SImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        {
          gen = gen_mmx_pblendvb_v8qi;
          blend_mode = V8QImode;
        }
      break;
    case E_V4QImode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        {
          gen = gen_mmx_pblendvb_v4qi;
          blend_mode = V4QImode;
        }
      break;
    case E_V2QImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
        {
          gen = gen_sse4_1_pblendvb;
          blend_mode = V16QImode;
        }
      break;
    case E_V8SFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
        gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
        gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V16BFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
        {
          gen = gen_avx2_pblendvb;
          blend_mode = V32QImode;
        }
      break;

    /* 512-bit modes: the compare that produced CMP was a full-width
       vector compare (mask compares were handled above), so use the
       blendm forms unconditionally.  */
    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V32BFmode:
      gen = gen_avx512bw_blendmv32bf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      if (blend_mode == mode)
        x = dest;
      else
        {
	  /* Blend at byte granularity: reinterpret everything in
	     BLEND_MODE, blend, then copy back in MODE.  */
          x = gen_reg_rtx (blend_mode);
          op_false = gen_lowpart (blend_mode, op_false);
          op_true = gen_lowpart (blend_mode, op_true);
          cmp = gen_lowpart (blend_mode, cmp);
        }

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
        emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      /* Generic fallback: (OP_TRUE & CMP) | (OP_FALSE & ~CMP).  */
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
                                NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
                               dest, 1, OPTAB_DIRECT);
      if (x != dest)
        emit_move_insn (dest, x);
    }
}
    4615              : 
    4616              : /* Swap, force into registers, or otherwise massage the two operands
    4617              :    to an sse comparison with a mask result.  Thus we differ a bit from
    4618              :    ix86_prepare_fp_compare_args which expects to produce a flags result.
    4619              : 
    4620              :    The DEST operand exists to help determine whether to commute commutative
    4621              :    operators.  The POP0/POP1 operands are updated in place.  The new
    4622              :    comparison code is returned, or UNKNOWN if not implementable.  */
    4623              : 
    4624              : static enum rtx_code
    4625        16961 : ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
    4626              :                                   rtx *pop0, rtx *pop1)
    4627              : {
    4628        16961 :   switch (code)
    4629              :     {
    4630           67 :     case LTGT:
    4631           67 :     case UNEQ:
    4632              :       /* AVX supports all the needed comparisons.  */
    4633           67 :       if (TARGET_AVX)
    4634              :         break;
    4635              :       /* We have no LTGT as an operator.  We could implement it with
    4636              :          NE & ORDERED, but this requires an extra temporary.  It's
    4637              :          not clear that it's worth it.  */
    4638              :       return UNKNOWN;
    4639              : 
    4640              :     case LT:
    4641              :     case LE:
    4642              :     case UNGT:
    4643              :     case UNGE:
    4644              :       /* These are supported directly.  */
    4645              :       break;
    4646              : 
    4647         5350 :     case EQ:
    4648         5350 :     case NE:
    4649         5350 :     case UNORDERED:
    4650         5350 :     case ORDERED:
    4651              :       /* AVX has 3 operand comparisons, no need to swap anything.  */
    4652         5350 :       if (TARGET_AVX)
    4653              :         break;
    4654              :       /* For commutative operators, try to canonicalize the destination
    4655              :          operand to be first in the comparison - this helps reload to
    4656              :          avoid extra moves.  */
    4657          771 :       if (!dest || !rtx_equal_p (dest, *pop1))
    4658              :         break;
    4659              :       /* FALLTHRU */
    4660              : 
    4661        10562 :     case GE:
    4662        10562 :     case GT:
    4663        10562 :     case UNLE:
    4664        10562 :     case UNLT:
    4665              :       /* These are not supported directly before AVX, and furthermore
    4666              :          ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
    4667              :          comparison operands to transform into something that is
    4668              :          supported.  */
    4669        10562 :       std::swap (*pop0, *pop1);
    4670        10562 :       code = swap_condition (code);
    4671        10562 :       break;
    4672              : 
    4673            0 :     default:
    4674            0 :       gcc_unreachable ();
    4675              :     }
    4676              : 
    4677              :   return code;
    4678              : }
    4679              : 
    4680              : /* Expand a floating-point conditional move.  Return true if successful.  */
    4681              : 
    4682              : bool
    4683        95972 : ix86_expand_fp_movcc (rtx operands[])
    4684              : {
    4685        95972 :   machine_mode mode = GET_MODE (operands[0]);
    4686        95972 :   enum rtx_code code = GET_CODE (operands[1]);
    4687        95972 :   rtx tmp, compare_op;
    4688        95972 :   rtx op0 = XEXP (operands[1], 0);
    4689        95972 :   rtx op1 = XEXP (operands[1], 1);
    4690              : 
    4691        95972 :   if (GET_MODE (op0) == BFmode
    4692        95972 :       && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    4693              :     return false;
    4694              : 
    4695        95972 :   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
    4696              :     {
    4697        65458 :       machine_mode cmode;
    4698              : 
    4699              :       /* Since we've no cmove for sse registers, don't force bad register
    4700              :          allocation just to gain access to it.  Deny movcc when the
    4701              :          comparison mode doesn't match the move mode.  */
    4702        65458 :       cmode = GET_MODE (op0);
    4703        65458 :       if (cmode == VOIDmode)
    4704            0 :         cmode = GET_MODE (op1);
    4705        65458 :       if (cmode != mode)
    4706              :         return false;
    4707              : 
    4708         9784 :       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
    4709         9784 :       if (code == UNKNOWN)
    4710              :         return false;
    4711              : 
    4712         9764 :       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
    4713              :                                      operands[2], operands[3]))
    4714              :         return true;
    4715              : 
    4716         2143 :       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
    4717              :                                  operands[2], operands[3]);
    4718         2143 :       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
    4719         2143 :       return true;
    4720              :     }
    4721              : 
    4722        30514 :   if (GET_MODE (op0) == TImode
    4723        30514 :       || (GET_MODE (op0) == DImode
    4724           72 :           && !TARGET_64BIT))
    4725              :     return false;
    4726              : 
    4727              :   /* The floating point conditional move instructions don't directly
    4728              :      support conditions resulting from a signed integer comparison.  */
    4729              : 
    4730        30442 :   compare_op = ix86_expand_compare (code, op0, op1);
    4731        30442 :   if (!fcmov_comparison_operator (compare_op, VOIDmode))
    4732              :     {
    4733          146 :       tmp = gen_reg_rtx (QImode);
    4734          146 :       ix86_expand_setcc (tmp, code, op0, op1);
    4735              : 
    4736          146 :       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    4737              :     }
    4738              : 
    4739        30442 :   operands[2] = force_reg (mode, operands[2]);
    4740        30442 :   operands[3] = force_reg (mode, operands[3]);
    4741        30442 :   emit_insn (gen_rtx_SET (operands[0],
    4742              :                           gen_rtx_IF_THEN_ELSE (mode, compare_op,
    4743              :                                                 operands[2], operands[3])));
    4744              : 
    4745        30442 :   return true;
    4746              : }
    4747              : 
    4748              : /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
    4749              : 
    4750              : static int
    4751         4854 : ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
    4752              : {
    4753         4854 :   switch (code)
    4754              :     {
    4755              :     case EQ:
    4756              :       return 0;
    4757          377 :     case LT:
    4758          377 :     case LTU:
    4759          377 :       return 1;
    4760          212 :     case LE:
    4761          212 :     case LEU:
    4762          212 :       return 2;
    4763         3051 :     case NE:
    4764         3051 :       return 4;
    4765          307 :     case GE:
    4766          307 :     case GEU:
    4767          307 :       return 5;
    4768          498 :     case GT:
    4769          498 :     case GTU:
    4770          498 :       return 6;
    4771            0 :     default:
    4772            0 :       gcc_unreachable ();
    4773              :     }
    4774              : }
    4775              : 
    4776              : /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
    4777              : 
    4778              : static int
    4779         1781 : ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
    4780              : {
    4781         1781 :   switch (code)
    4782              :     {
    4783              :     case EQ:
    4784              :       return 0x00;
    4785          354 :     case NE:
    4786          354 :       return 0x04;
    4787          514 :     case GT:
    4788          514 :       return 0x0e;
    4789           88 :     case LE:
    4790           88 :       return 0x02;
    4791           53 :     case GE:
    4792           53 :       return 0x0d;
    4793          620 :     case LT:
    4794          620 :       return 0x01;
    4795            2 :     case UNLE:
    4796            2 :       return 0x0a;
    4797            2 :     case UNLT:
    4798            2 :       return 0x09;
    4799           11 :     case UNGE:
    4800           11 :       return 0x05;
    4801           44 :     case UNGT:
    4802           44 :       return 0x06;
    4803            2 :     case UNEQ:
    4804            2 :       return 0x18;
    4805            0 :     case LTGT:
    4806            0 :       return 0x0c;
    4807            2 :     case ORDERED:
    4808            2 :       return 0x07;
    4809            2 :     case UNORDERED:
    4810            2 :       return 0x03;
    4811            0 :     default:
    4812            0 :       gcc_unreachable ();
    4813              :     }
    4814              : }
    4815              : 
    4816              : /* Return immediate value to be used in UNSPEC_PCMP
    4817              :    for comparison CODE in MODE.  */
    4818              : 
    4819              : static int
    4820         6635 : ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
    4821              : {
    4822         6635 :   if (FLOAT_MODE_P (mode))
    4823         1781 :     return ix86_fp_cmp_code_to_pcmp_immediate (code);
    4824         4854 :   return ix86_int_cmp_code_to_pcmp_immediate (code);
    4825              : }
    4826              : 
    4827              : /* Expand AVX-512 vector comparison.  */
    4828              : 
    4829              : bool
    4830         6635 : ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
    4831              : {
    4832         6635 :   machine_mode mask_mode = GET_MODE (dest);
    4833         6635 :   machine_mode cmp_mode = GET_MODE (cmp_op0);
    4834         6635 :   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
    4835         6635 :   int unspec_code;
    4836         6635 :   rtx unspec;
    4837              : 
    4838         6635 :   switch (code)
    4839              :     {
    4840              :     case LEU:
    4841              :     case GTU:
    4842              :     case GEU:
    4843              :     case LTU:
    4844              :       unspec_code = UNSPEC_UNSIGNED_PCMP;
    4845              :       break;
    4846              : 
    4847         6221 :     default:
    4848         6221 :       unspec_code = UNSPEC_PCMP;
    4849              :     }
    4850              : 
    4851         6635 :   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
    4852              :                            unspec_code);
    4853         6635 :   emit_insn (gen_rtx_SET (dest, unspec));
    4854              : 
    4855         6635 :   return true;
    4856              : }
    4857              : 
    4858              : /* Expand fp vector comparison.  */
    4859              : 
    4860              : bool
    4861         7177 : ix86_expand_fp_vec_cmp (rtx operands[])
    4862              : {
    4863         7177 :   enum rtx_code code = GET_CODE (operands[1]);
    4864         7177 :   rtx cmp;
    4865              : 
    4866         7177 :   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
    4867              :                                            &operands[2], &operands[3]);
    4868         7177 :   if (code == UNKNOWN)
    4869              :     {
    4870           20 :       rtx temp;
    4871           20 :       switch (GET_CODE (operands[1]))
    4872              :         {
    4873            2 :         case LTGT:
    4874            2 :           temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
    4875              :                                       operands[3], NULL, NULL);
    4876            2 :           cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
    4877              :                                      operands[3], NULL, NULL);
    4878            2 :           code = AND;
    4879            2 :           break;
    4880           18 :         case UNEQ:
    4881           18 :           temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
    4882              :                                       operands[3], NULL, NULL);
    4883           18 :           cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
    4884              :                                      operands[3], NULL, NULL);
    4885           18 :           code = IOR;
    4886           18 :           break;
    4887            0 :         default:
    4888            0 :           gcc_unreachable ();
    4889              :         }
    4890           20 :       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
    4891              :                                  OPTAB_DIRECT);
    4892              :     }
    4893              :   else
    4894         7157 :     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
    4895              :                                NULL, NULL);
    4896              : 
    4897         7177 :   if (operands[0] != cmp)
    4898         7094 :     emit_move_insn (operands[0], cmp);
    4899              : 
    4900         7177 :   return true;
    4901              : }
    4902              : 
/* Emit an integer vector comparison CODE of COP0 with COP1 and return
   the result rtx, whose mode matches DEST's mode DATA_MODE.  OP_TRUE and
   OP_FALSE are the values the caller intends to select between (either
   may be NULL; in the non-mask path NULL stands for all-ones resp. zero).
   On exit *NEGATE tells the caller whether the returned value computes
   the inverse of CODE and must still be negated.  Returns NULL only when
   a V2DImode comparison cannot be done on the current ISA (EQ needs
   SSE4.1, GT/GTU need SSE4.2).  */

static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case LE:
	case LEU:
	  /* x <= cst can be handled as x < cst + 1 unless there is
	     wrap around in cst + 1.  */
	  if (CONST_VECTOR_P (cop1)
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == LE)
		    {
		      /* For LE punt if some element is signed maximum.  */
		      if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			  == (GET_MODE_MASK (eltmode) >> 1))
			break;
		    }
		  /* For LEU punt if some element is unsigned maximum.  */
		  else if (elt == constm1_rtx)
		    break;
		}
	      /* All elements survived the wrap-around check, so
		 x <= cst becomes cst + 1 > x (note the operand swap).  */
	      if (i == n_elts)
		{
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  std::swap (cop0, cop1);
		  code = code == LE ? GT : GTU;
		  break;
		}
	    }
	  /* FALLTHRU */
	case NE:
	  /* Otherwise compute the inverse (GT/GTU resp. EQ) and let the
	     caller negate the result.  */
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  /* x >= cst can be handled as x > cst - 1 unless there is
	     wrap around in cst - 1.  */
	  if (CONST_VECTOR_P (cop1)
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == GE)
		    {
		      /* For GE punt if some element is signed minimum.  */
		      if (INTVAL (elt) < 0
			  && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			      == 0))
			break;
		    }
		  /* For GEU punt if some element is zero.  */
		  else if (elt == const0_rtx)
		    break;
		}
	      /* All elements survived the wrap-around check, so
		 x >= cst becomes x > cst - 1.  */
	      if (i == n_elts)
		{
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  code = code == GE ? GT : GTU;
		  break;
		}
	    }
	  /* Otherwise reverse to LT/LTU, record the pending negation,
	     and fall through to the swap below, yielding GT/GTU.  */
	  code = reverse_condition (code);
	  *negate = true;
	  /* FALLTHRU */

	case LT:
	case LTU:
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      if (CONST_VECTOR_P (cop0))
	cop0 = force_reg (mode, cop0);
      else if (CONST_VECTOR_P (cop1))
	cop1 = force_reg (mode, cop1);

      /* Substitute the implicit all-ones/zero selection values (and
	 account for a pending negation) so the min/max transform below
	 can inspect what will actually be selected.  */
      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code != EQ
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  /* Pick the umin/smin expander for MODE; gen stays NULL (and the
	     transform is skipped) when the ISA lacks a suitable min insn.  */
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv8qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv8qi3;
	      break;
	    case E_V4QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv4qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv4qi3;
	      break;
	    case E_V2QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv2qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv2qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv4hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv4hi3;
	      break;
	    case E_V2HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv2hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv2hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      /* min (x, y) == x is x <= y, the inverse of x > y, so the
		 pending negation flips once more.  */
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2SImode:
	    case E_V2DImode:
		{
		  rtx t1, t2, mask;

		  /* Subtract (-(INT MAX) - 1) from both operands to make
		     them signed.  */
		  mask = ix86_build_signbit_mask (mode, true, false);
		  t1 = gen_reg_rtx (mode);
		  emit_insn (gen_sub3_insn (t1, cop0, mask));

		  t2 = gen_reg_rtx (mode);
		  emit_insn (gen_sub3_insn (t2, cop1, mask));

		  cop0 = t1;
		  cop1 = t2;
		  code = GT;
		}
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8QImode:
	    case E_V4QImode:
	    case E_V2QImode:
	    case E_V8HImode:
	    case E_V4HImode:
	    case E_V2HImode:
	      /* Perform a parallel unsigned saturating subtraction.  */
	      x = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET
			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
	      cop0 = x;
	      cop1 = CONST0_RTX (mode);
	      code = EQ;
	      *negate = !*negate;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }

  /* When the comparison computes the inverse, swapping the selected
     values compensates if a select is actually emitted.  */
  if (*negate)
    std::swap (op_true, op_false);

  if (CONST_VECTOR_P (cop1))
    cop1 = force_reg (mode, cop1);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
    5270              : 
    5271              : /* Expand integer vector comparison.  */
    5272              : 
    5273              : bool
    5274        10358 : ix86_expand_int_vec_cmp (rtx operands[])
    5275              : {
    5276        10358 :   rtx_code code = GET_CODE (operands[1]);
    5277        10358 :   bool negate = false;
    5278        10358 :   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
    5279              :                                      operands[3], NULL, NULL, &negate);
    5280              : 
    5281        10358 :   if (!cmp)
    5282              :     return false;
    5283              : 
    5284        10358 :   if (negate)
    5285              :     {
    5286         3695 :       if (TARGET_AVX512F && GET_MODE_SIZE (GET_MODE (cmp)) >= 16)
    5287          106 :         cmp = gen_rtx_XOR (GET_MODE (cmp), cmp, CONSTM1_RTX (GET_MODE (cmp)));
    5288              :       else
    5289              :         {
    5290         6896 :           cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
    5291         3448 :                                          CONST0_RTX (GET_MODE (cmp)),
    5292              :                                          NULL, NULL, &negate);
    5293         3448 :           gcc_assert (!negate);
    5294              :         }
    5295              :     }
    5296              : 
    5297        10358 :   if (operands[0] != cmp)
    5298        10064 :     emit_move_insn (operands[0], cmp);
    5299              : 
    5300              :   return true;
    5301              : }
    5302              : 
    5303              : /* Expand a floating-point vector conditional move; a vcond operation
    5304              :    rather than a movcc operation.  */
    5305              : 
    5306              : bool
    5307            0 : ix86_expand_fp_vcond (rtx operands[])
    5308              : {
    5309            0 :   enum rtx_code code = GET_CODE (operands[3]);
    5310            0 :   rtx cmp;
    5311              : 
    5312            0 :   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
    5313              :                                            &operands[4], &operands[5]);
    5314            0 :   if (code == UNKNOWN)
    5315              :     {
    5316            0 :       rtx temp;
    5317            0 :       switch (GET_CODE (operands[3]))
    5318              :         {
    5319            0 :         case LTGT:
    5320            0 :           temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
    5321              :                                       operands[5], operands[0], operands[0]);
    5322            0 :           cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
    5323              :                                      operands[5], operands[1], operands[2]);
    5324            0 :           code = AND;
    5325            0 :           break;
    5326            0 :         case UNEQ:
    5327            0 :           temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
    5328              :                                       operands[5], operands[0], operands[0]);
    5329            0 :           cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
    5330              :                                      operands[5], operands[1], operands[2]);
    5331            0 :           code = IOR;
    5332            0 :           break;
    5333            0 :         default:
    5334            0 :           gcc_unreachable ();
    5335              :         }
    5336            0 :       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
    5337              :                                  OPTAB_DIRECT);
    5338            0 :       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
    5339            0 :       return true;
    5340              :     }
    5341              : 
    5342            0 :   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
    5343              :                                  operands[5], operands[1], operands[2]))
    5344              :     return true;
    5345              : 
    5346            0 :   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
    5347              :                              operands[1], operands[2]);
    5348            0 :   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
    5349            0 :   return true;
    5350              : }
    5351              : 
    5352              : /* Expand a signed/unsigned integral vector conditional move.  */
    5353              : 
bool
ix86_expand_int_vcond (rtx operands[])
{
  /* operands[0] = dest, operands[1]/operands[2] = values selected when
     the comparison is true/false, operands[3] = comparison code,
     operands[4]/operands[5] = compared operands.  DATA_MODE (the move
     mode) may differ from MODE (the comparison mode).  */
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      /* For LT the false arm (operands[2]) must be zero, for GE the
	 true arm (operands[1]) must be zero.  */
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      /* NEGOP is the arm selected when x < 0 holds.  */
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  /* x < 0 ? 1 : 0 --> logical shift of the sign bit down to
	     bit 0.  */
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  /* x < 0 ? -1 : 0 --> arithmetic shift replicating the sign
	     bit.  DImode is excluded (no vector arithmetic shift).  */
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  /* Force operands into forms the SSE expanders accept.  */
  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  /* If the comparison was expanded inverted, swap the two arms of the
     conditional move via the 1+negate / 2-negate indices.  */
  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
    5415              : 
/* Try to expand a (possibly variable) two-operand permutation with the
   AVX-512 VPERMT2* instructions.  Returns false if no vpermt2 pattern
   exists for the operand mode under the enabled ISA extensions; returns
   true (emitting nothing) when D is non-NULL and D->testing_p.  For
   float modes the mask is built in the same-sized integer mode.  */
static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  /* Map MODE to the matching vpermt2var builder, gated on the ISA that
     provides it (VBMI for byte elements, BW for word elements, VL for
     128/256-bit vectors).  */
  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  /* Float modes take an integer-mode index vector.  */
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  /* No suitable instruction available for this mode/ISA combination.  */
  if (gen == NULL)
    return false;

  if (d && d->testing_p)
    return true;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      /* Const case: materialize the permutation indices from D->perm as
	 a constant index vector in MASKMODE.  */
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
    5544              : 
    5545              : /* Expand a variable vector permutation.  */
    5546              : 
    5547              : void
    5548           10 : ix86_expand_vec_perm (rtx operands[])
    5549              : {
    5550           10 :   rtx target = operands[0];
    5551           10 :   rtx op0 = operands[1];
    5552           10 :   rtx op1 = operands[2];
    5553           10 :   rtx mask = operands[3];
    5554           10 :   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
    5555           10 :   machine_mode mode = GET_MODE (op0);
    5556           10 :   machine_mode maskmode = GET_MODE (mask);
    5557           10 :   int w, e, i;
    5558           10 :   bool one_operand_shuffle = rtx_equal_p (op0, op1);
    5559              : 
    5560              :   /* Number of elements in the vector.  */
    5561           10 :   w = GET_MODE_NUNITS (mode);
    5562           10 :   e = GET_MODE_UNIT_SIZE (mode);
    5563           10 :   gcc_assert (w <= 64);
    5564              : 
    5565              :   /* For HF mode vector, convert it to HI using subreg.  */
    5566           20 :   if (GET_MODE_INNER (mode) == HFmode)
    5567              :     {
    5568            6 :       machine_mode orig_mode = mode;
    5569            6 :       mode = mode_for_vector (HImode, w).require ();
    5570            6 :       target = lowpart_subreg (mode, target, orig_mode);
    5571            6 :       op0 = lowpart_subreg (mode, op0, orig_mode);
    5572            6 :       op1 = lowpart_subreg (mode, op1, orig_mode);
    5573              :     }
    5574              : 
    5575           10 :   if (TARGET_AVX512F && one_operand_shuffle)
    5576              :     {
    5577            5 :       rtx (*gen) (rtx, rtx, rtx) = NULL;
    5578            5 :       switch (mode)
    5579              :         {
    5580              :         case E_V16SImode:
    5581              :           gen =gen_avx512f_permvarv16si;
    5582              :           break;
    5583            0 :         case E_V16SFmode:
    5584            0 :           gen = gen_avx512f_permvarv16sf;
    5585            0 :           break;
    5586            0 :         case E_V8DImode:
    5587            0 :           gen = gen_avx512f_permvarv8di;
    5588            0 :           break;
    5589            0 :         case E_V8DFmode:
    5590            0 :           gen = gen_avx512f_permvarv8df;
    5591            0 :           break;
    5592              :         default:
    5593              :           break;
    5594              :         }
    5595            0 :       if (gen != NULL)
    5596              :         {
    5597            0 :           emit_insn (gen (target, op0, mask));
    5598            8 :           return;
    5599              :         }
    5600              :     }
    5601              : 
    5602           10 :   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    5603              :     return;
    5604              : 
    5605            2 :   if (TARGET_AVX2)
    5606              :     {
    5607            1 :       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
    5608              :         {
    5609              :           /* Unfortunately, the VPERMQ and VPERMPD instructions only support
    5610              :              an constant shuffle operand.  With a tiny bit of effort we can
    5611              :              use VPERMD instead.  A re-interpretation stall for V4DFmode is
    5612              :              unfortunate but there's no avoiding it.
    5613              :              Similarly for V16HImode we don't have instructions for variable
    5614              :              shuffling, while for V32QImode we can use after preparing suitable
    5615              :              masks vpshufb; vpshufb; vpermq; vpor.  */
    5616              : 
    5617              :           if (mode == V16HImode)
    5618              :             {
    5619              :               maskmode = mode = V32QImode;
    5620              :               w = 32;
    5621              :               e = 1;
    5622              :             }
    5623              :           else
    5624              :             {
    5625              :               maskmode = mode = V8SImode;
    5626              :               w = 8;
    5627              :               e = 4;
    5628              :             }
    5629            0 :           t1 = gen_reg_rtx (maskmode);
    5630              : 
    5631              :           /* Replicate the low bits of the V4DImode mask into V8SImode:
    5632              :                mask = { A B C D }
    5633              :                t1 = { A A B B C C D D }.  */
    5634            0 :           for (i = 0; i < w / 2; ++i)
    5635            0 :             vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
    5636            0 :           vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
    5637            0 :           vt = force_reg (maskmode, vt);
    5638            0 :           mask = gen_lowpart (maskmode, mask);
    5639            0 :           if (maskmode == V8SImode)
    5640            0 :             emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
    5641              :           else
    5642            0 :             emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
    5643              : 
    5644              :           /* Multiply the shuffle indicies by two.  */
    5645            0 :           t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
    5646              :                                     OPTAB_DIRECT);
    5647              : 
    5648              :           /* Add one to the odd shuffle indicies:
    5649              :                 t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
    5650            0 :           for (i = 0; i < w / 2; ++i)
    5651              :             {
    5652            0 :               vec[i * 2] = const0_rtx;
    5653            0 :               vec[i * 2 + 1] = const1_rtx;
    5654              :             }
    5655            0 :           vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
    5656            0 :           vt = validize_mem (force_const_mem (maskmode, vt));
    5657            0 :           t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
    5658              :                                     OPTAB_DIRECT);
    5659              : 
    5660              :           /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
    5661            0 :           operands[3] = mask = t1;
    5662            0 :           target = gen_reg_rtx (mode);
    5663            0 :           op0 = gen_lowpart (mode, op0);
    5664            0 :           op1 = gen_lowpart (mode, op1);
    5665              :         }
    5666              : 
    5667            1 :       switch (mode)
    5668              :         {
    5669            1 :         case E_V8SImode:
    5670              :           /* The VPERMD and VPERMPS instructions already properly ignore
    5671              :              the high bits of the shuffle elements.  No need for us to
    5672              :              perform an AND ourselves.  */
    5673            1 :           if (one_operand_shuffle)
    5674              :             {
    5675            0 :               emit_insn (gen_avx2_permvarv8si (target, op0, mask));
    5676            0 :               if (target != operands[0])
    5677            0 :                 emit_move_insn (operands[0],
    5678            0 :                                 gen_lowpart (GET_MODE (operands[0]), target));
    5679              :             }
    5680              :           else
    5681              :             {
    5682            1 :               t1 = gen_reg_rtx (V8SImode);
    5683            1 :               t2 = gen_reg_rtx (V8SImode);
    5684            1 :               emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
    5685            1 :               emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
    5686            1 :               goto merge_two;
    5687              :             }
    5688            0 :           return;
    5689              : 
    5690            0 :         case E_V8SFmode:
    5691            0 :           mask = gen_lowpart (V8SImode, mask);
    5692            0 :           if (one_operand_shuffle)
    5693            0 :             emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
    5694              :           else
    5695              :             {
    5696            0 :               t1 = gen_reg_rtx (V8SFmode);
    5697            0 :               t2 = gen_reg_rtx (V8SFmode);
    5698            0 :               emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
    5699            0 :               emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
    5700            0 :               goto merge_two;
    5701              :             }
    5702            0 :           return;
    5703              : 
    5704            0 :         case E_V4SImode:
    5705              :           /* By combining the two 128-bit input vectors into one 256-bit
    5706              :              input vector, we can use VPERMD and VPERMPS for the full
    5707              :              two-operand shuffle.  */
    5708            0 :           t1 = gen_reg_rtx (V8SImode);
    5709            0 :           t2 = gen_reg_rtx (V8SImode);
    5710            0 :           emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
    5711            0 :           emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
    5712            0 :           emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
    5713            0 :           emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
    5714            0 :           return;
    5715              : 
    5716            0 :         case E_V4SFmode:
    5717            0 :           t1 = gen_reg_rtx (V8SFmode);
    5718            0 :           t2 = gen_reg_rtx (V8SImode);
    5719            0 :           mask = gen_lowpart (V4SImode, mask);
    5720            0 :           emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
    5721            0 :           emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
    5722            0 :           emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
    5723            0 :           emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
    5724            0 :           return;
    5725              : 
    5726            0 :         case E_V32QImode:
    5727            0 :           t1 = gen_reg_rtx (V32QImode);
    5728            0 :           t2 = gen_reg_rtx (V32QImode);
    5729            0 :           t3 = gen_reg_rtx (V32QImode);
    5730            0 :           vt2 = GEN_INT (-128);
    5731            0 :           vt = gen_const_vec_duplicate (V32QImode, vt2);
    5732            0 :           vt = force_reg (V32QImode, vt);
    5733            0 :           for (i = 0; i < 32; i++)
    5734            0 :             vec[i] = i < 16 ? vt2 : const0_rtx;
    5735            0 :           vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
    5736            0 :           vt2 = force_reg (V32QImode, vt2);
    5737              :           /* From mask create two adjusted masks, which contain the same
    5738              :              bits as mask in the low 7 bits of each vector element.
    5739              :              The first mask will have the most significant bit clear
    5740              :              if it requests element from the same 128-bit lane
    5741              :              and MSB set if it requests element from the other 128-bit lane.
    5742              :              The second mask will have the opposite values of the MSB,
    5743              :              and additionally will have its 128-bit lanes swapped.
    5744              :              E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
    5745              :              t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
    5746              :              t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
    5747              :              stands for other 12 bytes.  */
    5748              :           /* The bit whether element is from the same lane or the other
    5749              :              lane is bit 4, so shift it up by 3 to the MSB position.  */
    5750            0 :           t5 = gen_reg_rtx (V4DImode);
    5751            0 :           emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
    5752              :                                     GEN_INT (3)));
    5753              :           /* Clear MSB bits from the mask just in case it had them set.  */
    5754            0 :           emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
    5755              :           /* After this t1 will have MSB set for elements from other lane.  */
    5756            0 :           emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
    5757              :           /* Clear bits other than MSB.  */
    5758            0 :           emit_insn (gen_andv32qi3 (t1, t1, vt));
    5759              :           /* Or in the lower bits from mask into t3.  */
    5760            0 :           emit_insn (gen_iorv32qi3 (t3, t1, t2));
    5761              :           /* And invert MSB bits in t1, so MSB is set for elements from the same
    5762              :              lane.  */
    5763            0 :           emit_insn (gen_xorv32qi3 (t1, t1, vt));
    5764              :           /* Swap 128-bit lanes in t3.  */
    5765            0 :           t6 = gen_reg_rtx (V4DImode);
    5766            0 :           emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
    5767              :                                           const2_rtx, GEN_INT (3),
    5768              :                                           const0_rtx, const1_rtx));
    5769              :           /* And or in the lower bits from mask into t1.  */
    5770            0 :           emit_insn (gen_iorv32qi3 (t1, t1, t2));
    5771            0 :           if (one_operand_shuffle)
    5772              :             {
    5773              :               /* Each of these shuffles will put 0s in places where
    5774              :                  element from the other 128-bit lane is needed, otherwise
    5775              :                  will shuffle in the requested value.  */
    5776            0 :               emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
    5777            0 :                                                 gen_lowpart (V32QImode, t6)));
    5778            0 :               emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
    5779              :               /* For t3 the 128-bit lanes are swapped again.  */
    5780            0 :               t7 = gen_reg_rtx (V4DImode);
    5781            0 :               emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
    5782              :                                               const2_rtx, GEN_INT (3),
    5783              :                                               const0_rtx, const1_rtx));
    5784              :               /* And oring both together leads to the result.  */
    5785            0 :               emit_insn (gen_iorv32qi3 (target, t1,
    5786            0 :                                         gen_lowpart (V32QImode, t7)));
    5787            0 :               if (target != operands[0])
    5788            0 :                 emit_move_insn (operands[0],
    5789            0 :                                 gen_lowpart (GET_MODE (operands[0]), target));
    5790            0 :               return;
    5791              :             }
    5792              : 
    5793            0 :           t4 = gen_reg_rtx (V32QImode);
    5794              :           /* Similarly to the above one_operand_shuffle code,
    5795              :              just for repeated twice for each operand.  merge_two:
    5796              :              code will merge the two results together.  */
    5797            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
    5798            0 :                                             gen_lowpart (V32QImode, t6)));
    5799            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
    5800            0 :                                             gen_lowpart (V32QImode, t6)));
    5801            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
    5802            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
    5803            0 :           t7 = gen_reg_rtx (V4DImode);
    5804            0 :           emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
    5805              :                                           const2_rtx, GEN_INT (3),
    5806              :                                           const0_rtx, const1_rtx));
    5807            0 :           t8 = gen_reg_rtx (V4DImode);
    5808            0 :           emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
    5809              :                                           const2_rtx, GEN_INT (3),
    5810              :                                           const0_rtx, const1_rtx));
    5811            0 :           emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
    5812            0 :           emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
    5813            0 :           t1 = t4;
    5814            0 :           t2 = t3;
    5815            0 :           goto merge_two;
    5816              : 
    5817            0 :         default:
    5818            0 :           gcc_assert (GET_MODE_SIZE (mode) <= 16);
    5819              :           break;
    5820              :         }
    5821              :     }
    5822              : 
    5823            1 :   if (TARGET_XOP)
    5824              :     {
    5825              :       /* The XOP VPPERM insn supports three inputs.  By ignoring the
    5826              :          one_operand_shuffle special case, we avoid creating another
    5827              :          set of constant vectors in memory.  */
    5828            0 :       one_operand_shuffle = false;
    5829              : 
    5830              :       /* mask = mask & {2*w-1, ...} */
    5831            0 :       vt = GEN_INT (2*w - 1);
    5832              :     }
    5833              :   else
    5834              :     {
    5835              :       /* mask = mask & {w-1, ...} */
    5836            1 :       vt = GEN_INT (w - 1);
    5837              :     }
    5838              : 
    5839            1 :   vt = gen_const_vec_duplicate (maskmode, vt);
    5840            1 :   mask = expand_simple_binop (maskmode, AND, mask, vt,
    5841              :                               NULL_RTX, 0, OPTAB_DIRECT);
    5842              : 
    5843              :   /* For non-QImode operations, convert the word permutation control
    5844              :      into a byte permutation control.  */
    5845            1 :   if (mode != V16QImode)
    5846              :     {
    5847            1 :       mask = expand_simple_binop (maskmode, ASHIFT, mask,
    5848            2 :                                   GEN_INT (exact_log2 (e)),
    5849              :                                   NULL_RTX, 0, OPTAB_DIRECT);
    5850              : 
    5851              :       /* Convert mask to vector of chars.  */
    5852            1 :       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
    5853              : 
    5854              :       /* Replicate each of the input bytes into byte positions:
    5855              :          (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
    5856              :          (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
    5857              :          (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
    5858           18 :       for (i = 0; i < 16; ++i)
    5859           16 :         vec[i] = GEN_INT (i/e * e);
    5860            1 :       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
    5861            1 :       vt = validize_mem (force_const_mem (V16QImode, vt));
    5862            1 :       if (TARGET_XOP)
    5863            0 :         emit_insn (gen_xop_pperm (mask, mask, mask, vt));
    5864              :       else
    5865            1 :         emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
    5866              : 
    5867              :       /* Convert it into the byte positions by doing
    5868              :          mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
    5869           17 :       for (i = 0; i < 16; ++i)
    5870           16 :         vec[i] = GEN_INT (i % e);
    5871            1 :       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
    5872            1 :       vt = validize_mem (force_const_mem (V16QImode, vt));
    5873            1 :       emit_insn (gen_addv16qi3 (mask, mask, vt));
    5874              :     }
    5875              : 
    5876              :   /* The actual shuffle operations all operate on V16QImode.  */
    5877            1 :   op0 = gen_lowpart (V16QImode, op0);
    5878            1 :   op1 = gen_lowpart (V16QImode, op1);
    5879              : 
    5880            1 :   if (TARGET_XOP)
    5881              :     {
    5882            0 :       if (GET_MODE (target) != V16QImode)
    5883            0 :         target = gen_reg_rtx (V16QImode);
    5884            0 :       emit_insn (gen_xop_pperm (target, op0, op1, mask));
    5885            0 :       if (target != operands[0])
    5886            0 :         emit_move_insn (operands[0],
    5887            0 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5888              :     }
    5889            1 :   else if (one_operand_shuffle)
    5890              :     {
    5891            1 :       if (GET_MODE (target) != V16QImode)
    5892            1 :         target = gen_reg_rtx (V16QImode);
    5893            1 :       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
    5894            1 :       if (target != operands[0])
    5895            1 :         emit_move_insn (operands[0],
    5896            1 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5897              :     }
    5898              :   else
    5899              :     {
    5900            0 :       rtx xops[6];
    5901            0 :       bool ok;
    5902              : 
    5903              :       /* Shuffle the two input vectors independently.  */
    5904            0 :       t1 = gen_reg_rtx (V16QImode);
    5905            0 :       t2 = gen_reg_rtx (V16QImode);
    5906            0 :       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
    5907            0 :       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
    5908              : 
    5909            1 :  merge_two:
    5910              :       /* Then merge them together.  The key is whether any given control
    5911              :          element contained a bit set that indicates the second word.  */
    5912            1 :       mask = operands[3];
    5913            1 :       vt = GEN_INT (w);
    5914            1 :       if (maskmode == V2DImode && !TARGET_SSE4_1)
    5915              :         {
    5916              :           /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
    5917              :              more shuffle to convert the V2DI input mask into a V4SI
    5918              :              input mask.  At which point the masking that expand_int_vcond
    5919              :              will work as desired.  */
    5920            0 :           rtx t3 = gen_reg_rtx (V4SImode);
    5921            0 :           emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
    5922              :                                         const0_rtx, const0_rtx,
    5923              :                                         const2_rtx, const2_rtx));
    5924            0 :           mask = t3;
    5925            0 :           maskmode = V4SImode;
    5926            0 :           e = w = 4;
    5927              :         }
    5928              : 
    5929            1 :       vt = gen_const_vec_duplicate (maskmode, vt);
    5930            1 :       vt = force_reg (maskmode, vt);
    5931            1 :       mask = expand_simple_binop (maskmode, AND, mask, vt,
    5932              :                                   NULL_RTX, 0, OPTAB_DIRECT);
    5933              : 
    5934            1 :       if (GET_MODE (target) != mode)
    5935            0 :         target = gen_reg_rtx (mode);
    5936            1 :       xops[0] = target;
    5937            1 :       xops[1] = gen_lowpart (mode, t2);
    5938            1 :       xops[2] = gen_lowpart (mode, t1);
    5939            1 :       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
    5940            1 :       xops[4] = mask;
    5941            1 :       xops[5] = vt;
    5942            1 :       ok = ix86_expand_int_vcond (xops);
    5943            1 :       gcc_assert (ok);
    5944            1 :       if (target != operands[0])
    5945            0 :         emit_move_insn (operands[0],
    5946            0 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5947              :     }
    5948              : }
    5949              : 
    5950              : /* Extend SRC into next wider integer vector type.  UNSIGNED_P is
    5951              :    true if we should do zero extension, else sign extension.  */
    5952              : 
    5953              : void
    5954          343 : ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
    5955              : {
    5956          343 :   machine_mode imode = GET_MODE (src);
    5957          343 :   rtx ops[3];
    5958              : 
    5959          343 :   switch (imode)
    5960              :     {
    5961          343 :     case E_V8QImode:
    5962          343 :     case E_V4QImode:
    5963          343 :     case E_V2QImode:
    5964          343 :     case E_V4HImode:
    5965          343 :     case E_V2HImode:
    5966          343 :     case E_V2SImode:
    5967          343 :       break;
    5968            0 :     default:
    5969            0 :       gcc_unreachable ();
    5970              :     }
    5971              : 
    5972          343 :   ops[0] = dest;
    5973              : 
    5974          343 :   ops[1] = force_reg (imode, src);
    5975              : 
    5976          343 :   if (unsigned_p)
    5977           97 :     ops[2] = force_reg (imode, CONST0_RTX (imode));
    5978              :   else
    5979          246 :     ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
    5980              :                                   ops[1], pc_rtx, pc_rtx);
    5981              : 
    5982          343 :   ix86_split_mmx_punpck (ops, false);
    5983          343 : }
    5984              : 
/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.
   DEST receives the widened vector.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      /* With SSE4.1 (and its AVX2/AVX-512 descendants) we have direct
	 pmovsx/pmovzx-style extension patterns that operate on the low
	 half of their input, so the dispatch below picks the extension
	 insn and, for 256/512-bit inputs, the half-vector extract.  */
      rtx (*unpack)(rtx, rtx);
      rtx (*extract)(rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      switch (imode)
	{
	case E_V64QImode:
	  if (unsigned_p)
	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
	  else
	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
	  halfmode = V32QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
	  break;
	case E_V32QImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv16qiv16hi2;
	  else
	    unpack = gen_avx2_sign_extendv16qiv16hi2;
	  halfmode = V16QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
	  break;
	case E_V32HImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv16hiv16si2;
	  else
	    unpack = gen_avx512f_sign_extendv16hiv16si2;
	  halfmode = V16HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
	  break;
	case E_V16HImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv8hiv8si2;
	  else
	    unpack = gen_avx2_sign_extendv8hiv8si2;
	  halfmode = V8HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
	  break;
	case E_V16SImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv8siv8di2;
	  else
	    unpack = gen_avx512f_sign_extendv8siv8di2;
	  halfmode = V8SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
	  break;
	case E_V8SImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv4siv4di2;
	  else
	    unpack = gen_avx2_sign_extendv4siv4di2;
	  halfmode = V4SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
	  break;
	case E_V16QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
	  break;
	case E_V8HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
	  else
	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
	  break;
	case E_V4SImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2siv2di2;
	  else
	    unpack = gen_sse4_1_sign_extendv2siv2di2;
	  break;
	case E_V8QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
	  break;
	case E_V4HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
	  else
	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
	  break;
	case E_V4QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2qiv2hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv2qiv2hi2;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (GET_MODE_SIZE (imode) >= 32)
	{
	  /* 256/512-bit input: extract the requested half into a
	     half-width register and extend that.  */
	  tmp = gen_reg_rtx (halfmode);
	  emit_insn (extract (tmp, src));
	}
      else if (high_p)
	{
	  /* 128-bit or smaller input: the extension patterns read the
	     low half, so shift the high half down first.  */
	  switch (GET_MODE_SIZE (imode))
	    {
	    case 16:
	      /* Shift higher 8 bytes to lower 8 bytes.  */
	      tmp = gen_reg_rtx (V1TImode);
	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
					     GEN_INT (64)));
	      break;
	    case 8:
	      /* Shift higher 4 bytes to lower 4 bytes.  */
	      tmp = gen_reg_rtx (V1DImode);
	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
					    GEN_INT (32)));
	      break;
	    case 4:
	      /* Shift higher 2 bytes to lower 2 bytes.  */
	      tmp = gen_reg_rtx (V1SImode);
	      emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
					    GEN_INT (16)));
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  tmp = gen_lowpart (imode, tmp);
	}
      else
	tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      /* Without SSE4.1, widen by interleaving SRC with either zero
	 (zero extension) or a sign mask computed as 0 > SRC (sign
	 extension), taking the high or low interleave as requested.  */
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
	{
	case E_V16QImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv16qi;
	  else
	    unpack = gen_vec_interleave_lowv16qi;
	  break;
	case E_V8HImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv8hi;
	  else
	    unpack = gen_vec_interleave_lowv8hi;
	  break;
	case E_V4SImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv4si;
	  else
	    unpack = gen_vec_interleave_lowv4si;
	  break;
	case E_V8QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw;
	  else
	    unpack = gen_mmx_punpcklbw;
	  break;
	case E_V4HImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhwd;
	  else
	    unpack = gen_mmx_punpcklwd;
	  break;
	case E_V4QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw_low;
	  else
	    unpack = gen_mmx_punpcklbw_low;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (unsigned_p)
	tmp = force_reg (imode, CONST0_RTX (imode));
      else
	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
				   src, pc_rtx, pc_rtx);

      /* The interleave result still has mode IMODE; pun it to DEST's
	 (wider-element) mode on the final move.  */
      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
    6192              : 
    6193              : /* Return true if mem is pool constant which contains a const_vector
    6194              :    perm index, assign the index to PERM.  */
    6195              : bool
    6196           35 : ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
    6197              : {
    6198           35 :   machine_mode mode = GET_MODE (mem);
    6199           35 :   int nelt = GET_MODE_NUNITS (mode);
    6200              : 
    6201           35 :   if (!INTEGRAL_MODE_P (mode))
    6202              :     return false;
    6203              : 
    6204              :     /* Needs to be constant pool.  */
    6205           35 :   if (!(MEM_P (mem))
    6206           35 :       || !SYMBOL_REF_P (XEXP (mem, 0))
    6207           70 :       || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
    6208              :    return false;
    6209              : 
    6210           35 :   rtx constant = get_pool_constant (XEXP (mem, 0));
    6211              : 
    6212           35 :   if (!CONST_VECTOR_P (constant))
    6213              :     return false;
    6214              : 
    6215              :   /* There could be some rtx like
    6216              :      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
    6217              :      but with "*.LC1" refer to V2DI constant vector.  */
    6218           35 :   if (GET_MODE (constant) != mode)
    6219              :     {
    6220            0 :       constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
    6221              : 
    6222            0 :       if (constant == nullptr || !CONST_VECTOR_P (constant))
    6223              :         return false;
    6224              :     }
    6225              : 
    6226          771 :   for (int i = 0; i != nelt; i++)
    6227          736 :     perm[i] = UINTVAL (XVECEXP (constant, 0, i));
    6228              : 
    6229              :   return true;
    6230              : }
    6231              : 
/* Split OPERAND into word-sized parts stored in PARTS.  Similar to
   split_double_mode, but works for floating point parameters and
   nonoffsetable memories.  For pushes, it returns just stack offsets;
   the values will be saved in the right order.  Between two and four
   parts are generated (e.g. XFmode on 32-bit targets yields three).
   Returns the number of parts.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  /* Number of word-sized pieces: 32-bit words on !TARGET_64BIT
     (XFmode is 80 bits padded to 96, hence 3), 64-bit words
     otherwise (the "+ 4" rounds XFmode's 80/96 bits up to 2).  */
  if (!TARGET_64BIT)
    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsetable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      /* For a push, every part is the same word-mode push rtx; the
         caller emits them in the correct order.  */
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (CONST_VECTOR_P (operand))
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      /* 32-bit target: split into SImode pieces.  */
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      /* Consecutive hard registers; only valid after reload
	         when REGNO arithmetic is meaningful.  */
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      /* Consecutive 4-byte slices of the memory operand.  */
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      /* Decompose the FP constant into its 32-bit target
	         words; real_to_target fills one word per long.  */
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      /* 64-bit target: split into DImode pieces (plus an SImode tail
         for XFmode, whose upper part is only 32 bits wide).  */
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.
	         Reassemble pairs of them into 64-bit immediates,
	         masking to guard against sign extension when long
	         is wider than 32 bits.  */
	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
					  << 32), DImode);

	      if (upper_mode == SImode)
		parts[1] = gen_int_mode (l[2], SImode);
	      else
		parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}
    6377              : 
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   The value is split into word-sized parts; operands 2 onward receive
   the destination parts and operands 6 onward the corresponding source
   parts, in the correct order.  */
    6382              : 
void
ix86_split_long_move (rtx operands[])
{
  /* part[0][] receives the destination pieces and part[1][] the source
     pieces; a value is split into at most four word-sized parts.  */
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
         fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
          && SYMBOL_REF_P (XEXP (operands[1], 0))
          && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
        operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
        {
          operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
        }
      else
        operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
                || offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
          && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
        src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
         automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
        part[1][i] = change_address (part[1][i],
                                     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      /* Record which destination parts are mentioned in the source
         address; each such part would be clobbered before the remaining
         source parts are read.  */
      for (i = 0; i < nparts; i++)
        {
          collisionparts[i]
            = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
          if (collisionparts[i])
            collisions++;
        }

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts [1])
        {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
        }
      else if (collisions == 1
               && nparts == 4
               && (collisionparts [1] || collisionparts [2]))
        {
          if (collisionparts [1])
            {
              std::swap (part[0][1], part[0][2]);
              std::swap (part[1][1], part[1][2]);
            }
          else
            {
              std::swap (part[0][2], part[0][3]);
              std::swap (part[1][2], part[1][3]);
            }
        }

      /* If there are more collisions, we can't handle it by reordering.
         Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
        {
          rtx base, addr;

          collisions = 1;

          base = part[0][nparts - 1];

          /* Handle the case when the last part isn't valid for lea.
             Happens in 64-bit mode storing the 12-byte XFmode.  */
          if (GET_MODE (base) != Pmode)
            base = gen_rtx_REG (Pmode, REGNO (base));

          addr = XEXP (part[1][0], 0);
          if (TARGET_TLS_DIRECT_SEG_REFS)
            {
              struct ix86_address parts;
              int ok = ix86_decompose_address (addr, &parts);
              gcc_assert (ok);
              /* It is not valid to use %gs: or %fs: in lea.  */
              gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
            }
          /* Materialize the source address into BASE, then rewrite every
             source part as an offset from BASE, so only the first move
             still collides with the destination.  */
          emit_insn (gen_rtx_SET (base, addr));
          part[1][0] = replace_equiv_address (part[1][0], base);
          for (i = 1; i < nparts; i++)
            {
              tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
              part[1][i] = replace_equiv_address (part[1][i], tmp);
            }
        }
    }

  if (push)
    {
      if (!TARGET_64BIT)
        {
          /* Emit the third (and fourth) parts here; the two lowest
             parts are pushed, high part first, at the end below.  */
          if (nparts == 3)
            {
              if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
              emit_move_insn (part[0][2], part[1][2]);
            }
          else if (nparts == 4)
            {
              emit_move_insn (part[0][3], part[1][3]);
              emit_move_insn (part[0][2], part[1][2]);
            }
        }
      else
        {
          /* In 64bit mode we don't have 32bit push available.  In case this is
             register, it is OK - we will just use larger counterpart.  We also
             retype memory - these comes from attempt to avoid REX prefix on
             moving of second half of TFmode value.  */
          if (GET_MODE (part[1][1]) == SImode)
            {
              switch (GET_CODE (part[1][1]))
                {
                case MEM:
                  part[1][1] = adjust_address (part[1][1], DImode, 0);
                  break;

                case REG:
                  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
                  break;

                default:
                  gcc_unreachable ();
                }

              if (GET_MODE (part[1][0]) == SImode)
                part[1][0] = part[1][1];
            }
        }
      /* Push the upper part before the lower one, since each push
         decrements the stack pointer.  */
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
           || (nparts == 3
               && REGNO (part[0][0]) == REGNO (part[1][2]))
           || (nparts == 4
               && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
          && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      /* Copy from the highest part down to avoid the overlap.  */
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
        {
          operands[2 + i] = part[0][j];
          operands[6 + i] = part[1][j];
        }
    }
  else
    {
      for (i = 0; i < nparts; i++)
        {
          operands[2 + i] = part[0][i];
          operands[6 + i] = part[1][i];
        }
    }

  /* Attempt to locally unCSE nonzero constants.  */
  for (j = 0; j < nparts - 1; j++)
    if (CONST_INT_P (operands[6 + j])
        && operands[6 + j] != const0_rtx
        && REG_P (operands[2 + j]))
      /* A register destination was just loaded with this constant; reuse
         it as the source for any later part wanting the same value.  */
      for (i = j; i < nparts - 1; i++)
        if (CONST_INT_P (operands[7 + i])
            && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
          operands[7 + i] = operands[2 + j];

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

  return;
}
    6603              : 
    6604              : /* Helper function of ix86_split_ashl used to generate an SImode/DImode
    6605              :    left shift by a constant, either using a single shift or
    6606              :    a sequence of add instructions.  */
    6607              : 
    6608              : static void
    6609         4343 : ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
    6610              : {
    6611         4343 :   if (count == 1
    6612         4343 :       || (count * ix86_cost->add <= ix86_cost->shift_const
    6613            0 :           && !optimize_insn_for_size_p ()))
    6614              :     {
    6615           16 :       while (count-- > 0)
    6616            8 :         emit_insn (gen_add2_insn (operand, operand));
    6617              :     }
    6618              :   else
    6619              :     {
    6620         4335 :       rtx (*insn)(rtx, rtx, rtx);
    6621              : 
    6622         4335 :       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
    6623         4335 :       emit_insn (insn (operand, operand, GEN_INT (count)));
    6624              :     }
    6625         4343 : }
    6626              : 
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  /* Width in bits of one half of the double-word MODE.  */
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      /* Constant shift count: split destination and source into their
         low/high halves and reduce the count modulo the bit width.  */
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          /* Shifting by at least a half-word: the low source half moves
             into the high half, the low result half is zero, and any
             remaining count is a plain shift of the high half.  */
          emit_move_insn (high[0], low[1]);
          ix86_expand_clear (low[0]);

          if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
        }
      else if (count == 1)
        {
          /* Shift by one: add the low half to itself and then add the
             high half to itself with the carry, avoiding shld.  */
          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);
          rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
          rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
          half_mode = mode == DImode ? SImode : DImode;
          emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
                                             low[0], low[0]));
          emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
                                     x3, x4));
        }
      else
        {
          /* General constant count below a half-word: shld bits from the
             low half into the high half, then shift the low half.  */
          gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
        }
      return;
    }

  /* Variable shift count.  The halves are SImode when splitting DImode
     and DImode when splitting TImode.  */
  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
         can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
        {
          rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

          /* Test the half-width bit of the count and set exactly one of
             the halves to 1 via setcc on the low byte.  */
          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
          emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

          d = gen_lowpart (QImode, low[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_EQ (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));

          d = gen_lowpart (QImode, high[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_NE (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));
        }

      /* Otherwise, we can get the same results by manually performing
         a bit extract operation on bit 5/6, and then performing the two
         shifts.  The two methods of getting 0/1 into low/high are exactly
         the same size.  Avoiding the shift in the bit extract case helps
         pentium4 a bit; no one else seems to care much either way.  */
      else
        {
          rtx (*gen_lshr3)(rtx, rtx, rtx);
          rtx (*gen_and3)(rtx, rtx, rtx);
          rtx (*gen_xor3)(rtx, rtx, rtx);
          HOST_WIDE_INT bits;
          rtx x;

          /* Bit 5 (resp. 6) of the count selects the half for a 64-bit
             (resp. 128-bit) double-word shift.  */
          if (mode == DImode)
            {
              gen_lshr3 = gen_lshrsi3;
              gen_and3 = gen_andsi3;
              gen_xor3 = gen_xorsi3;
              bits = 5;
            }
          else
            {
              gen_lshr3 = gen_lshrdi3;
              gen_and3 = gen_anddi3;
              gen_xor3 = gen_xordi3;
              bits = 6;
            }

          if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
            x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
          else
            x = gen_lowpart (half_mode, operands[2]);
          emit_insn (gen_rtx_SET (high[0], x));

          /* high = (count >> bits) & 1; low = high ^ 1.  */
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
          emit_insn (gen_and3 (high[0], high[0], const1_rtx));
          emit_move_insn (low[0], high[0]);
          emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
        }

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
         know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
        emit_move_insn (high[0], low[0]);
      else
        emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      /* General case: shld into the high half, then shift the low half
         and fix up below for counts of half_width or more.  */
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  /* Adjust the halves for counts >= half_width: with cmove use the
     conditional-move pattern with SCRATCH cleared to zero, otherwise
     fall back to the branching pattern.  */
  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
                 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
    6781              : 
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  /* The halves are SImode when splitting DImode, DImode for TImode.  */
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  /* Width in bits of one half of the double-word MODE.  */
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
        {
          /* Shift by all bits but one: both result halves are just the
             sign bit of the high input half smeared across a half.  */
          emit_move_insn (high[0], high[1]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);

        }
      else if (count >= half_width)
        {
          /* Shifting by at least a half-word: the high input half moves
             into the low half, the high result half is sign-filled, and
             any remaining count shifts the low half.  */
          emit_move_insn (low[0], high[1]);
          emit_move_insn (high[0], low[0]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));

          if (count > half_width)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else if (count == 1
               && (TARGET_USE_RCR || optimize_size > 1))
        {
          /* Shift by one via sar of the high half followed by a
             rotate-through-carry of the low half.  */
          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);
          if (mode == DImode)
            {
              emit_insn (gen_ashrsi3_carry (high[0], high[0]));
              emit_insn (gen_rcrsi2 (low[0], low[0]));
            }
          else
            {
              emit_insn (gen_ashrdi3_carry (high[0], high[0]));
              emit_insn (gen_rcrdi2 (low[0], low[0]));
            }
        }
      else
        {
          /* General constant count below a half-word: shrd bits from the
             high half into the low half, then sar the high half.  */
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable shift count.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

     if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      /* Adjust for counts >= half_width.  With cmove, SCRATCH is set to
         the sign extension of the high half for the conditional-move
         pattern; otherwise use the branching pattern.  */
      if (TARGET_CMOVE && scratch)
        {
          emit_move_insn (scratch, high[0]);
          emit_insn (gen_ashr3 (scratch, scratch,
                                GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_3
                   (half_mode, low[0], high[0], operands[2]));
    }
}
    6872              : 
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  /* The halves are SImode when splitting DImode, DImode for TImode.  */
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  /* Width in bits of one half of the double-word MODE.  */
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          /* Shifting by at least a half-word: the high input half moves
             into the low half, the high result half is zero, and any
             remaining count shifts the low half.  */
          emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else if (count == 1
               && (TARGET_USE_RCR || optimize_size > 1))
        {
          /* Shift by one via shr of the high half followed by a
             rotate-through-carry of the low half.  */
          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);
          if (mode == DImode)
            {
              emit_insn (gen_lshrsi3_carry (high[0], high[0]));
              emit_insn (gen_rcrsi2 (low[0], low[0]));
            }
          else
            {
              emit_insn (gen_lshrdi3_carry (high[0], high[0]));
              emit_insn (gen_rcrdi2 (low[0], low[0]));
            }
        }
      else
        {
          /* General constant count below a half-word: shrd bits from the
             high half into the low half, then shr the high half.  */
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable shift count.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      /* Adjust for counts >= half_width: with cmove use the
         conditional-move pattern with SCRATCH cleared to zero,
         otherwise the branching pattern.  */
      if (TARGET_CMOVE && scratch)
        {
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2
                   (half_mode, low[0], high[0], operands[2]));
    }
}
    6951              : 
/* Helper function to split a TImode left shift into DImode operations
   when the APX NDD (new data destination) forms are available.

   OPERANDS[0] is the TImode destination, OPERANDS[1] the TImode source
   and OPERANDS[2] the shift count (either a CONST_INT or a register).
   SCRATCH, if non-NULL, is a DImode scratch register used for the
   variable-count fixup when CMOV is available.  */
void
ix86_split_ashl_ndd (rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;

  rtx low[2], high[2];
  int count;

  /* Decompose both dest and src into their low/high DImode halves.  */
  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Shift counts are taken modulo the TImode bit width.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (count >= half_width)
        {
          /* Shifting by >= 64: the result's low half is zero and the
             high half is the source's low half shifted by COUNT - 64.  */
          count = count - half_width;
          if (count == 0)
            {
              if (!rtx_equal_p (high[0], low[1]))
                emit_move_insn (high[0], low[1]);
            }
          else if (count == 1)
            /* x << 1 as x + x.  */
            emit_insn (gen_adddi3 (high[0], low[1], low[1]));
          else
            emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));

          ix86_expand_clear (low[0]);
        }
      else if (count == 1)
        {
          /* Shift left by one as an add/adc pair: low = low + low sets
             the carry, which is then folded into high + high.  */
          rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
          rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
          emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
                                             low[1], low[1]));
          emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
                                     x3, x4));
        }
      else
        {
          /* General constant count < 64: shld for the high half,
             plain shift for the low half.  */
          emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
                                          GEN_INT (count)));
          emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable count: emit the shld/shl pair, then adjust for
         counts >= 64 at run time.  */
      emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
                                      operands[2]));
      emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
      if (TARGET_CMOVE && scratch)
        {
          /* With CMOV, conditionally swap in a zeroed scratch.  */
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (DImode, high[0], low[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
    }
}
    7013              : 
/* Helper function to split a TImode logical or arithmetic right shift
   into DImode operations when the APX NDD forms are available.

   CODE is ASHIFTRT or LSHIFTRT.  OPERANDS[0] is the TImode destination,
   OPERANDS[1] the TImode source and OPERANDS[2] the shift count.
   SCRATCH, if non-NULL, is a DImode scratch register used for the
   variable-count fixup when CMOV is available.  */
void
ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;
  bool ashr_p = code == ASHIFTRT;
  /* DImode generator matching the requested shift kind.  */
  rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
                                         : gen_lshrdi3;

  rtx low[2], high[2];
  int count;

  /* Decompose both dest and src into their low/high DImode halves.  */
  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Shift counts are taken modulo the TImode bit width.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
        {
          /* Arithmetic shift by 127: both halves become the sign bit
             replicated, i.e. high >> 63 copied into both.  */
          emit_insn (gen_shr (high[0], high[1],
                              GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);
        }
      else if (count >= half_width)
        {
          /* Count >= 64: the high half is sign-fill (arithmetic) or
             zero (logical); the low half comes from the old high half
             shifted by COUNT - 64.  */
          if (ashr_p)
            emit_insn (gen_shr (high[0], high[1],
                                GEN_INT (half_width - 1)));
          else
            ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_shr (low[0], high[1],
                                GEN_INT (count - half_width)));
          else
            emit_move_insn (low[0], high[1]);
        }
      else
        {
          /* General constant count < 64: shrd for the low half,
             plain shift for the high half.  */
          emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
                                          GEN_INT (count)));
          emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable count: emit the shrd/shift pair, then adjust for
         counts >= 64 at run time.  */
      emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
                                      operands[2]));
      emit_insn (gen_shr (high[0], high[1], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          if (ashr_p)
            {
              /* Fill value for the high half is the replicated sign.  */
              emit_move_insn (scratch, high[0]);
              emit_insn (gen_shr (scratch, scratch,
                                  GEN_INT (half_width - 1)));
            }
          else
            ix86_expand_clear (scratch);

          emit_insn (gen_x86_shift_adj_1
                     (DImode, low[0], high[0], operands[2], scratch));
        }
      else if (ashr_p)
        emit_insn (gen_x86_shift_adj_3
                   (DImode, low[0], high[0], operands[2]));
      else
        emit_insn (gen_x86_shift_adj_2
                   (DImode, low[0], high[0], operands[2]));
    }
}
    7087              : 
    7088              : /* Expand move of V1TI mode register X to a new TI mode register.  */
    7089              : static rtx
    7090           17 : ix86_expand_v1ti_to_ti (rtx x)
    7091              : {
    7092           17 :   rtx result = gen_reg_rtx (TImode);
    7093           17 :   if (TARGET_SSE2)
    7094              :     {
    7095           17 :       rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
    7096           17 :       rtx lo = gen_lowpart (DImode, result);
    7097           17 :       emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
    7098           17 :       rtx hi = gen_highpart (DImode, result);
    7099           17 :       emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
    7100              :     }
    7101              :   else
    7102            0 :     emit_move_insn (result, gen_lowpart (TImode, x));
    7103           17 :   return result;
    7104              : }
    7105              : 
    7106              : /* Expand move of TI mode register X to a new V1TI mode register.  */
    7107              : static rtx
    7108           17 : ix86_expand_ti_to_v1ti (rtx x)
    7109              : {
    7110           17 :   if (TARGET_SSE2)
    7111              :     {
    7112           17 :       rtx lo = gen_lowpart (DImode, x);
    7113           17 :       rtx hi = gen_highpart (DImode, x);
    7114           17 :       rtx tmp = gen_reg_rtx (V2DImode);
    7115           17 :       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
    7116           17 :       return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
    7117              :     }
    7118              : 
    7119            0 :   return force_reg (V1TImode, gen_lowpart (V1TImode, x));
    7120              : }
    7121              : 
    7122              : /* Expand V1TI mode shift (of rtx_code CODE) by constant.  */
    7123              : void
    7124           42 : ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
    7125              : {
    7126           42 :   rtx op1 = force_reg (V1TImode, operands[1]);
    7127              : 
    7128           42 :   if (!CONST_INT_P (operands[2]))
    7129              :     {
    7130            6 :       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
    7131            6 :       rtx tmp2 = gen_reg_rtx (TImode);
    7132            3 :       rtx (*shift) (rtx, rtx, rtx)
    7133            6 :             = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
    7134            6 :       emit_insn (shift (tmp2, tmp1, operands[2]));
    7135            6 :       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
    7136            6 :       emit_move_insn (operands[0], tmp3);
    7137            6 :       return;
    7138              :     }
    7139              : 
    7140           36 :   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
    7141              : 
    7142           36 :   if (bits == 0)
    7143              :     {
    7144            0 :       emit_move_insn (operands[0], op1);
    7145            0 :       return;
    7146              :     }
    7147              : 
    7148           36 :   if ((bits & 7) == 0)
    7149              :     {
    7150            0 :       rtx tmp = gen_reg_rtx (V1TImode);
    7151            0 :       if (code == ASHIFT)
    7152            0 :         emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
    7153              :       else
    7154            0 :         emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
    7155            0 :       emit_move_insn (operands[0], tmp);
    7156            0 :       return;
    7157              :     }
    7158              : 
    7159           36 :   rtx tmp1 = gen_reg_rtx (V1TImode);
    7160           36 :   if (code == ASHIFT)
    7161           18 :     emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
    7162              :   else
    7163           18 :     emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
    7164              : 
    7165              :   /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
    7166           36 :   rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7167              : 
    7168              :   /* tmp3 will be the V2DImode result.  */
    7169           36 :   rtx tmp3 = gen_reg_rtx (V2DImode);
    7170              : 
    7171           36 :   if (bits > 64)
    7172              :     {
    7173           18 :       if (code == ASHIFT)
    7174            9 :         emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    7175              :       else
    7176            9 :         emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    7177              :     }
    7178              :   else
    7179              :     {
    7180              :       /* tmp4 is operands[1], in V2DImode.  */
    7181           18 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7182              : 
    7183           18 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7184           18 :       if (code == ASHIFT)
    7185            9 :         emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7186              :       else
    7187            9 :         emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7188              : 
    7189           18 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7190           18 :       if (code == ASHIFT)
    7191            9 :         emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
    7192              :       else
    7193            9 :         emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
    7194              : 
    7195           18 :       emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    7196              :     }
    7197              : 
    7198              :   /* Convert the result back to V1TImode and store in operands[0].  */
    7199           36 :   rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7200           36 :   emit_move_insn (operands[0], tmp7);
    7201              : }
    7202              : 
/* Expand a V1TImode rotate (CODE is ROTATE or ROTATERT) of OPERANDS[1]
   by OPERANDS[2] into OPERANDS[0].  Constant counts are synthesized
   from pshufd element permutations and V4SImode shifts; variable counts
   fall back to a TImode rotate.  */
void
ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable count: round-trip through TImode.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*rotate) (rtx, rtx, rtx)
            = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
      emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* Canonicalize to a left rotate count.  */
  if (code == ROTATERT)
    bits = 128 - bits;

  if ((bits & 31) == 0)
    {
      /* Multiple of 32: a single pshufd element permutation does it.
         0x93 rotates the four 32-bit elements up one position (<< 32),
         0x4e swaps the two 64-bit halves (<< 64), and 0x39 rotates
         the elements down one position (<< 96).  */
      rtx tmp2 = gen_reg_rtx (V4SImode);
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      if (bits == 32)
        emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
      else if (bits == 64)
        emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
      else
        emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Whole-byte count: OR together the two SSE2 byte shifts.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      rtx tmp2 = gen_reg_rtx (V1TImode);
      rtx tmp3 = gen_reg_rtx (V1TImode);

      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
      emit_move_insn (operands[0], tmp3);
      return;
    }

  rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));

  /* LOBITS/HIBITS are OP1 pre-rotated by element permutations so that
     the final per-element shift/OR below produces the rotate: LOBITS
     is rotated by 32 * (bits / 32) and HIBITS by 32 more, aliasing
     OP1_V4SI itself when the needed permutation is the identity.  */
  rtx lobits;
  rtx hibits;

  switch (bits >> 5)
    {
    case 0:
      lobits = op1_v4si;
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
      break;

    case 1:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
      break;

    case 2:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
      break;

    default:
      lobits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
      hibits = op1_v4si;
      break;
    }

  /* Per-element shifts supply the sub-32-bit part of the rotate; each
     element's missing low bits come from the next-lower element of
     HIBITS, which the permutation above placed in the same lane.  */
  rtx tmp1 = gen_reg_rtx (V4SImode);
  rtx tmp2 = gen_reg_rtx (V4SImode);
  rtx tmp3 = gen_reg_rtx (V4SImode);

  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
}
    7303              : 
    7304              : /* Expand V1TI mode ashiftrt by constant.  */
    7305              : void
    7306          109 : ix86_expand_v1ti_ashiftrt (rtx operands[])
    7307              : {
    7308          109 :   rtx op1 = force_reg (V1TImode, operands[1]);
    7309              : 
    7310          109 :   if (!CONST_INT_P (operands[2]))
    7311              :     {
    7312            3 :       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
    7313            3 :       rtx tmp2 = gen_reg_rtx (TImode);
    7314            3 :       emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
    7315            3 :       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
    7316            3 :       emit_move_insn (operands[0], tmp3);
    7317            3 :       return;
    7318              :     }
    7319              : 
    7320          106 :   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
    7321              : 
    7322          106 :   if (bits == 0)
    7323              :     {
    7324            0 :       emit_move_insn (operands[0], op1);
    7325            0 :       return;
    7326              :     }
    7327              : 
    7328          106 :   if (bits == 127)
    7329              :     {
    7330              :       /* Two operations.  */
    7331            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7332            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7333            3 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7334              : 
    7335            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7336            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7337              : 
    7338            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
    7339            3 :       return;
    7340              :     }
    7341              : 
    7342          103 :   if (bits == 64)
    7343              :     {
    7344              :       /* Three operations.  */
    7345            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7346            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7347            3 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7348              : 
    7349            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7350            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7351              : 
    7352            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7353            3 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7354            3 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7355            3 :       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
    7356              : 
    7357            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7358            3 :       return;
    7359              :     }
    7360              : 
    7361          100 :   if (bits == 96)
    7362              :     {
    7363              :       /* Three operations.  */
    7364            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7365            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7366            3 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
    7367              : 
    7368            3 :       rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7369            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
    7370            3 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7371            3 :       emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
    7372              : 
    7373            3 :       rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
    7374            3 :       rtx tmp7 = gen_reg_rtx (V4SImode);
    7375            3 :       emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
    7376              : 
    7377            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
    7378            3 :       return;
    7379              :     }
    7380              : 
    7381           97 :   if (bits >= 111)
    7382              :     {
    7383              :       /* Three operations.  */
    7384           21 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7385           21 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7386           21 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
    7387              : 
    7388           21 :       rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7389           21 :       rtx tmp4 = gen_reg_rtx (V8HImode);
    7390           21 :       emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
    7391              : 
    7392           21 :       rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
    7393           21 :       rtx tmp6 = gen_reg_rtx (V4SImode);
    7394           21 :       emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
    7395              : 
    7396           21 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7397           21 :       return;
    7398              :     }
    7399              : 
    7400           76 :   if (TARGET_AVX2 || TARGET_SSE4_1)
    7401              :     {
    7402              :       /* Three operations.  */
    7403           50 :       if (bits == 32)
    7404              :         {
    7405            2 :           rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7406            2 :           rtx tmp2 = gen_reg_rtx (V4SImode);
    7407            2 :           emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
    7408              : 
    7409            2 :           rtx tmp3 = gen_reg_rtx (V1TImode);
    7410            2 :           emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
    7411              : 
    7412            2 :           if (TARGET_AVX2)
    7413              :             {
    7414            1 :               rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
    7415            1 :               rtx tmp5 = gen_reg_rtx (V4SImode);
    7416            1 :               emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
    7417              :                                                GEN_INT (7)));
    7418              : 
    7419            1 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
    7420              :             }
    7421              :           else
    7422              :             {
    7423            1 :               rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7424            1 :               rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7425            1 :               rtx tmp6 = gen_reg_rtx (V8HImode);
    7426            1 :               emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
    7427              :                                              GEN_INT (0x3f)));
    7428              : 
    7429            1 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7430              :             }
    7431            2 :           return;
    7432              :         }
    7433              : 
    7434              :       /* Three operations.  */
    7435           48 :       if (bits == 8 || bits == 16 || bits == 24)
    7436              :         {
    7437            6 :           rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7438            6 :           rtx tmp2 = gen_reg_rtx (V4SImode);
    7439            6 :           emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7440              : 
    7441            6 :           rtx tmp3 = gen_reg_rtx (V1TImode);
    7442            6 :           emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
    7443              : 
    7444            6 :           if (TARGET_AVX2)
    7445              :             {
    7446            3 :               rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
    7447            3 :               rtx tmp5 = gen_reg_rtx (V4SImode);
    7448            3 :               emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
    7449              :                                                GEN_INT (7)));
    7450              : 
    7451            3 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
    7452              :             }
    7453              :           else
    7454              :             {
    7455            3 :               rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7456            3 :               rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7457            3 :               rtx tmp6 = gen_reg_rtx (V8HImode);
    7458            3 :               emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
    7459              :                                              GEN_INT (0x3f)));
    7460              : 
    7461            3 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7462              :             }
    7463            6 :           return;
    7464              :         }
    7465              :     }
    7466              : 
    7467           68 :   if (bits > 96)
    7468              :     {
    7469              :       /* Four operations.  */
    7470            3 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7471            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7472            3 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
    7473              : 
    7474            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7475            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
    7476              : 
    7477            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
    7478            3 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7479            3 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7480            3 :       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
    7481              : 
    7482            3 :       rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
    7483            3 :       rtx tmp8 = gen_reg_rtx (V4SImode);
    7484            3 :       emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
    7485              : 
    7486            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
    7487            3 :       return;
    7488              :     }
    7489              : 
    7490           65 :   if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    7491              :     {
    7492              :       /* Four operations.  */
    7493            4 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7494            4 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7495            4 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7496              : 
    7497            4 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7498            4 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7499              : 
    7500            4 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7501            4 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
    7502              : 
    7503            4 :       rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7504            4 :       rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
    7505            4 :       rtx tmp7 = gen_reg_rtx (V8HImode);
    7506            6 :       emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
    7507              :                                      GEN_INT (bits == 48 ? 0x1f : 0x07)));
    7508              : 
    7509            4 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
    7510            4 :       return;
    7511              :     }
    7512              : 
    7513           61 :   if ((bits & 7) == 0)
    7514              :     {
    7515              :       /* Five operations.  */
    7516            9 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7517            9 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7518            9 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7519              : 
    7520            9 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7521            9 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7522              : 
    7523            9 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7524            9 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
    7525              : 
    7526            9 :       rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7527            9 :       rtx tmp6 = gen_reg_rtx (V1TImode);
    7528            9 :       emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
    7529              : 
    7530            9 :       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7531            9 :       rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
    7532            9 :       rtx tmp9 = gen_reg_rtx (V2DImode);
    7533            9 :       emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
    7534              : 
    7535            9 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
    7536            9 :       return;
    7537              :     }
    7538              : 
    7539           52 :   if (TARGET_AVX2 && bits < 32)
    7540              :     {
    7541              :       /* Six operations.  */
    7542            9 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7543            9 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7544            9 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7545              : 
    7546            9 :       rtx tmp3 = gen_reg_rtx (V1TImode);
    7547            9 :       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
    7548              : 
    7549            9 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7550            9 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7551            9 :       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7552              : 
    7553            9 :       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7554            9 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7555            9 :       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
    7556              : 
    7557            9 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7558            9 :       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
    7559              : 
    7560            9 :       rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
    7561            9 :       rtx tmp10 = gen_reg_rtx (V4SImode);
    7562            9 :       emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
    7563              : 
    7564            9 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
    7565            9 :       return;
    7566              :     }
    7567              : 
    7568           43 :   if (TARGET_SSE4_1 && bits < 15)
    7569              :     {
    7570              :       /* Six operations.  */
    7571            4 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7572            4 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7573            4 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7574              : 
    7575            4 :       rtx tmp3 = gen_reg_rtx (V1TImode);
    7576            4 :       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
    7577              : 
    7578            4 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7579            4 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7580            4 :       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7581              : 
    7582            4 :       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7583            4 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7584            4 :       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
    7585              : 
    7586            4 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7587            4 :       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
    7588              : 
    7589            4 :       rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7590            4 :       rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
    7591            4 :       rtx tmp11 = gen_reg_rtx (V8HImode);
    7592            4 :       emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
    7593              : 
    7594            4 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
    7595            4 :       return;
    7596              :     }
    7597              : 
    7598           18 :   if (bits == 1)
    7599              :     {
    7600              :       /* Eight operations.  */
    7601            1 :       rtx tmp1 = gen_reg_rtx (V1TImode);
    7602            1 :       emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
    7603              : 
    7604            1 :       rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7605            1 :       rtx tmp3 = gen_reg_rtx (V2DImode);
    7606            1 :       emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
    7607              : 
    7608            1 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7609            1 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7610            1 :       emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
    7611              : 
    7612            1 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7613            1 :       emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
    7614              : 
    7615            1 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7616            1 :       emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
    7617              : 
    7618            1 :       rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
    7619            1 :       rtx tmp9 = gen_reg_rtx (V4SImode);
    7620            1 :       emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
    7621              : 
    7622            1 :       rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
    7623            1 :       rtx tmp11 = gen_reg_rtx (V2DImode);
    7624            1 :       emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
    7625              : 
    7626            1 :       rtx tmp12 = gen_reg_rtx (V2DImode);
    7627            1 :       emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
    7628              : 
    7629            1 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
    7630            1 :       return;
    7631              :     }
    7632              : 
    7633           38 :   if (bits > 64)
    7634              :     {
    7635              :       /* Eight operations.  */
    7636           12 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7637           12 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7638           12 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7639              : 
    7640           12 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7641           12 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7642              : 
    7643           12 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7644           12 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
    7645              : 
    7646           12 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7647           12 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7648           12 :       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
    7649              : 
    7650           12 :       rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7651           12 :       rtx tmp8 = gen_reg_rtx (V1TImode);
    7652           12 :       emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
    7653              : 
    7654           12 :       rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7655           12 :       rtx tmp10 = gen_reg_rtx (V2DImode);
    7656           12 :       emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
    7657              : 
    7658           12 :       rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
    7659           12 :       rtx tmp12 = gen_reg_rtx (V2DImode);
    7660           12 :       emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
    7661              : 
    7662           12 :       rtx tmp13 = gen_reg_rtx (V2DImode);
    7663           12 :       emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
    7664              : 
    7665           12 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
    7666              :     }
    7667              :   else
    7668              :     {
    7669              :       /* Nine operations.  */
    7670           26 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7671           26 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7672           26 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7673              : 
    7674           26 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7675           26 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7676              : 
    7677           26 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7678           26 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
    7679              : 
    7680           26 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7681           26 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7682           26 :       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
    7683              : 
    7684           26 :       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7685           26 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7686           26 :       emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
    7687              : 
    7688           26 :       rtx tmp9 = gen_reg_rtx (V2DImode);
    7689           26 :       emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
    7690              : 
    7691           26 :       rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7692           26 :       rtx tmp11 = gen_reg_rtx (V1TImode);
    7693           26 :       emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
    7694              : 
    7695           26 :       rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
    7696           26 :       rtx tmp13 = gen_reg_rtx (V2DImode);
    7697           26 :       emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
    7698              : 
    7699           26 :       rtx tmp14 = gen_reg_rtx (V2DImode);
    7700           26 :       emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
    7701              : 
    7702           26 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
    7703              :     }
    7704              : }
    7705              : 
/* Expand V2DI mode ashiftrt.  SSE has no native 64-bit arithmetic
   right shift, so synthesize one.  OPERANDS[0] is the destination,
   OPERANDS[1] the source and OPERANDS[2] the shift count (CONST_INT
   or register).  Strategies are tried from cheapest to most general:
   pcmpgtq for count >= 63, a psrad+shuffle combination for other
   constant counts, a logical-shift/sign-mask merge for variable
   counts without XOP, and finally XOP's native vpshaq.  */
void
ix86_expand_v2di_ashiftrt (rtx operands[])
{
  /* Shift by zero is a plain move.  */
  if (operands[2] == const0_rtx)
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* Count >= 63 yields 0 or all-ones per element, i.e. the sign mask.
     pcmpgtq against zero (0 > x) computes exactly that in one insn.
     Skipped when optimizing for size since it needs a zeroed register.  */
  if (TARGET_SSE4_2
      && CONST_INT_P (operands[2])
      && UINTVAL (operands[2]) >= 63
      && !optimize_insn_for_size_p ())
    {
      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
      emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
      return;
    }

  /* Constant count (and either no XOP, or a count XOP handles worse):
     build the result from 32-bit arithmetic shifts plus one constant
     permutation of the V4SI halves.  */
  if (CONST_INT_P (operands[2])
      && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
    {
      vec_perm_builder sel (4, 4, 1);
      sel.quick_grow (4);
      rtx arg0, arg1;
      rtx op1 = lowpart_subreg (V4SImode,
                                force_reg (V2DImode, operands[1]),
                                V2DImode);
      rtx target = gen_reg_rtx (V4SImode);
      if (UINTVAL (operands[2]) >= 63)
        {
          /* Count >= 63: every result word is a copy of the sign word
             (elements 1 and 3 of the psrad-by-31 result).  */
          arg0 = arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
          sel[0] = 1;
          sel[1] = 1;
          sel[2] = 3;
          sel[3] = 3;
        }
      else if (INTVAL (operands[2]) > 32)
        {
          /* 33..62: low word comes from psrad by (count - 32) of the
             high word, high word is the sign extension (psrad by 31);
             interleave the two shift results.  */
          arg0 = gen_reg_rtx (V4SImode);
          arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
          emit_insn (gen_ashrv4si3 (arg0, op1,
                                    GEN_INT (INTVAL (operands[2]) - 32)));
          sel[0] = 1;
          sel[1] = 5;
          sel[2] = 3;
          sel[3] = 7;
        }
      else if (INTVAL (operands[2]) == 32)
        {
          /* Exactly 32: low word is the unshifted high word, high word
             is its sign extension.  */
          arg0 = op1;
          arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
          sel[0] = 1;
          sel[1] = 5;
          sel[2] = 3;
          sel[3] = 7;
        }
      else
        {
          /* 1..31: logical 64-bit shift supplies the low words, psrad
             by the same count supplies sign-correct high words; merge
             low words of the former with high words of the latter.  */
          arg0 = gen_reg_rtx (V2DImode);
          arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
          emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
          arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
          sel[0] = 0;
          sel[1] = 5;
          sel[2] = 2;
          sel[3] = 7;
        }
      vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
      rtx op0 = operands[0];
      bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
                                                  target, arg0, arg1,
                                                  indices);
      /* All selected permutations are expressible on SSE2.  */
      gcc_assert (ok);
      emit_move_insn (op0, lowpart_subreg (V2DImode, target, V4SImode));
      return;
    }

  /* Variable count without XOP:
     result = (x >>u count) | (sign_mask << (64 - count)).  */
  if (!TARGET_XOP)
    {
      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
      rtx zero_or_all_ones;
      if (TARGET_SSE4_2)
        {
          /* pcmpgtq: 0 > x gives all-ones for negative elements.  */
          zero_or_all_ones = gen_reg_rtx (V2DImode);
          emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
                                         operands[1]));
        }
      else
        {
          /* Without pcmpgtq: psrad by 31, then broadcast each element's
             sign word into both halves via pshufd {1,1,3,3}.  */
          rtx temp = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (temp,
                                    lowpart_subreg (V4SImode,
                                                    force_reg (V2DImode,
                                                               operands[1]),
                                                    V2DImode),
                                    GEN_INT (31)));
          zero_or_all_ones = gen_reg_rtx (V4SImode);
          emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
                                        const1_rtx, const1_rtx,
                                        GEN_INT (3), GEN_INT (3)));
          zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
                                             V4SImode);
        }
      rtx lshr_res = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
      rtx ashl_res = gen_reg_rtx (V2DImode);
      rtx amount;
      /* Compute 64 - count for the compensating left shift of the
         sign mask; on 32-bit the count is assembled into a vector reg.  */
      if (TARGET_64BIT)
        {
          amount = gen_reg_rtx (DImode);
          emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
                                 operands[2]));
        }
      else
        {
          rtx temp = gen_reg_rtx (SImode);
          emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
                                 lowpart_subreg (SImode, operands[2],
                                                 DImode)));
          amount = gen_reg_rtx (V4SImode);
          emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
                                        temp));
        }
      amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
      emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
      emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
      return;
    }

  /* XOP path: vpshaq shifts left for positive counts and right for
     negative ones, so broadcast the negated count and shift.  */
  rtx reg = gen_reg_rtx (V2DImode);
  rtx par;
  bool negate = false;
  int i;

  if (CONST_INT_P (operands[2]))
    operands[2] = GEN_INT (-INTVAL (operands[2]));
  else
    negate = true;	/* Negate in the vector register below.  */

  par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
  for (i = 0; i < 2; i++)
    XVECEXP (par, 0, i) = operands[2];

  emit_insn (gen_vec_initv2didi (reg, par));

  if (negate)
    emit_insn (gen_negv2di2 (reg, reg));

  emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
}
    7861              : 
    7862              : /* Replace all occurrences of REG FROM with REG TO in X, including
    7863              :    occurrences with different modes.  */
    7864              : 
    7865              : rtx
    7866        38540 : ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
    7867              : {
    7868        38540 :   gcc_checking_assert (REG_P (from)
    7869              :                        && REG_P (to)
    7870              :                        && GET_MODE (from) == GET_MODE (to));
    7871        38540 :   if (!reg_overlap_mentioned_p (from, x))
    7872              :     return x;
    7873           94 :   rtx ret = copy_rtx (x);
    7874           94 :   subrtx_ptr_iterator::array_type array;
    7875          458 :   FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
    7876              :     {
    7877          364 :       rtx *loc = *iter;
    7878          364 :       x = *loc;
    7879          364 :       if (REG_P (x) && REGNO (x) == REGNO (from))
    7880              :         {
    7881           94 :           if (x == from)
    7882           94 :             *loc = to;
    7883              :           else
    7884              :             {
    7885            0 :               gcc_checking_assert (REG_NREGS (x) == 1);
    7886            0 :               *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
    7887              :             }
    7888              :         }
    7889              :     }
    7890           94 :   return ret;
    7891           94 : }
    7892              : 
    7893              : /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
    7894              :    DImode for constant loop counts.  */
    7895              : 
    7896              : static machine_mode
    7897        34736 : counter_mode (rtx count_exp)
    7898              : {
    7899         8116 :   if (GET_MODE (count_exp) != VOIDmode)
    7900        27910 :     return GET_MODE (count_exp);
    7901         6826 :   if (!CONST_INT_P (count_exp))
    7902            0 :     return Pmode;
    7903              :   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    7904              :     return DImode;
    7905              :   return SImode;
    7906              : }
    7907              : 
/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
   specified in bytes.  When ISSETMEM is TRUE, output the equivalent loop to set
   memory by VALUE (supposed to be in MODE).

   The size is rounded down to whole number of chunk size moved at once.
   SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.
   DESTPTR (and SRCPTR, when !ISSETMEM) are advanced past the copied
   region as a side effect.  EXPECTED_SIZE (-1 if unknown) drives the
   branch-probability hints attached to the loop branch.  */


static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx count, machine_mode mode, int unroll,
                               int expected_size, bool issetmem)
{
  rtx_code_label *out_label = nullptr;
  rtx_code_label *top_label = nullptr;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  /* Bytes consumed per loop iteration.  */
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;
  int loop_count;

  /* With a known constant COUNT we can tell how many times the body
     will run; -1 means unknown.  */
  if (expected_size != -1 && CONST_INT_P (count))
    loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
  else
    loop_count = -1;

  /* Don't generate the loop (labels and back-branch) if the loop count
     is exactly 1 - the body alone suffices.  */
  if (loop_count != 1)
    {
      top_label = gen_label_rtx ();
      out_label = gen_label_rtx ();
    }
  iter = gen_reg_rtx (iter_mode);

  /* SIZE = COUNT rounded down to a multiple of the chunk size.  */
  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
                              NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      /* Byte loop: skip it entirely when the masked size is zero.  */
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
                               true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  if (loop_count != 1)
    emit_label (top_label);

  /* ITER is the byte offset of the current chunk; widen it to Pmode
     for addressing.  */
  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using single temporary.
         Also using 4 temporaries is overkill in 32bit mode.
         NOTE: this branch is intentionally disabled ("&& 0"); the
         all-loads-then-all-stores scheme below is used unconditionally.  */
      if (!TARGET_64BIT && 0)
        {
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem = adjust_address (copy_rtx (destmem), mode,
                                            GET_MODE_SIZE (mode));
                  srcmem = adjust_address (copy_rtx (srcmem), mode,
                                           GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, srcmem);
            }
        }
      else
        {
          /* Load all UNROLL pieces into temporaries first, then store
             them all, so loads and stores can be scheduled apart.  */
          rtx tmpreg[4];
          gcc_assert (unroll <= 4);
          for (i = 0; i < unroll; i++)
            {
              tmpreg[i] = gen_reg_rtx (mode);
              if (i)
                srcmem = adjust_address (copy_rtx (srcmem), mode,
                                         GET_MODE_SIZE (mode));
              emit_move_insn (tmpreg[i], srcmem);
            }
          for (i = 0; i < unroll; i++)
            {
              if (i)
                destmem = adjust_address (copy_rtx (destmem), mode,
                                          GET_MODE_SIZE (mode));
              emit_move_insn (destmem, tmpreg[i]);
            }
        }
    }
  else
    /* memset flavour: store VALUE into each of the UNROLL pieces.  */
    for (i = 0; i < unroll; i++)
      {
        if (i)
          destmem = adjust_address (copy_rtx (destmem), mode,
                                    GET_MODE_SIZE (mode));
        emit_move_insn (destmem, value);
      }

  /* ITER += chunk size.  */
  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  if (loop_count != 1)
    {
      emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
                               true, top_label);
      if (expected_size != -1)
        {
          /* Convert the byte estimate into an iteration estimate and
             hint the back-branch probability accordingly.  */
          expected_size /= GET_MODE_SIZE (mode) * unroll;
          if (expected_size == 0)
            predict_jump (0);
          else if (expected_size > REG_BR_PROB_BASE)
            predict_jump (REG_BR_PROB_BASE - 1);
          else
            /* Loop of EXPECTED_SIZE iterations loops back with
               probability 1 - 1/EXPECTED_SIZE (rounded).  */
            predict_jump (REG_BR_PROB_BASE
                          - (REG_BR_PROB_BASE + expected_size / 2)
                            / expected_size);
        }
      else
        predict_jump (REG_BR_PROB_BASE * 80 / 100);
    }
  /* Advance DESTPTR (and SRCPTR) past the processed region.  */
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
                                 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
    }
  if (loop_count != 1)
    emit_label (out_label);
}
    8061              : 
    8062              : /* Divide COUNTREG by SCALE.  */
    8063              : static rtx
    8064        16573 : scale_counter (rtx countreg, int scale)
    8065              : {
    8066        16573 :   rtx sc;
    8067              : 
    8068        16573 :   if (scale == 1)
    8069              :     return countreg;
    8070        10607 :   if (CONST_INT_P (countreg))
    8071        10591 :     return GEN_INT (INTVAL (countreg) / scale);
    8072           16 :   gcc_assert (REG_P (countreg));
    8073              : 
    8074           48 :   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
    8075           32 :                             GEN_INT (exact_log2 (scale)),
    8076              :                             NULL, 1, OPTAB_DIRECT);
    8077           16 :   return sc;
    8078              : }
    8079              : 
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			   rtx count,
			   machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  /* Canonicalize DESTMEM into a BLKmode reference through DESTPTR
     unless it already has exactly that form.  */
  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  /* The rep prefix counts in units of MODE, so scale the byte count
     down accordingly and zero-extend it to Pmode.  */
  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  /* DESTEXP is the value DESTPTR will hold after the rep instruction:
     DESTPTR + COUNTREG * GET_MODE_SIZE (MODE).  */
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  /* For a known count, record the exact number of bytes touched
     (rounded down to a multiple of the operation size); otherwise any
     previously recorded MEM size may be wrong, so drop it.  */
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      /* rep stos: store the low MODE-sized part of the promoted VALUE.  */
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      /* rep movs: mirror the destination setup for the source operand.  */
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else
	{
	  if (MEM_SIZE_KNOWN_P (srcmem))
	    clear_mem_size (srcmem);
	}
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
    8161              : 
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      /* A 1-byte move must always be available.  */
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  /* No usable vector mode of that size; fall back to
	     word-sized scalar moves.  */
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      /* Advance both pointers past the piece just copied.  */
      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
    8230              : 
    8231              : /* Helper function for the string operations below.  Dest VARIABLE whether
    8232              :    it is aligned to VALUE bytes.  If true, jump to the label.  */
    8233              : 
    8234              : static rtx_code_label *
    8235        39025 : ix86_expand_aligntest (rtx variable, int value, bool epilogue)
    8236              : {
    8237        39025 :   rtx_code_label *label = gen_label_rtx ();
    8238        39025 :   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
    8239        39025 :   if (GET_MODE (variable) == DImode)
    8240          897 :     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
    8241              :   else
    8242        38128 :     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
    8243        39025 :   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
    8244              :                            1, label);
    8245        39025 :   if (epilogue)
    8246            3 :     predict_jump (REG_BR_PROB_BASE * 50 / 100);
    8247              :   else
    8248        39022 :     predict_jump (REG_BR_PROB_BASE * 90 / 100);
    8249        39025 :   return label;
    8250              : }
    8251              : 
    8252              : 
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */

static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  /* With a compile-time count the residue is known exactly; let the
     generic move-by-pieces machinery emit it.  */
  if (CONST_INT_P (count))
    {
      unsigned HOST_WIDE_INT countval = UINTVAL (count);
      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
      unsigned int destalign = MEM_ALIGN (destmem);
      cfun->machine->by_pieces_in_use = true;
      move_by_pieces (destmem, srcmem, epilogue_size, destalign,
		      RETURN_BEGIN);
      cfun->machine->by_pieces_in_use = false;
      return;
    }
  /* For a large dynamic residue, mask COUNT down and copy it with a
     small QImode loop.  */
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				    count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      /* Emit one conditional movs per power-of-two residue bit; each
	 strmov auto-increments both pointers.  */
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      /* No cheap stringops: keep a running OFFSET register and address
	 through base+offset instead of bumping the pointers.  */
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  /* Last possible byte: no need to maintain OFFSET any further.  */
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
    8360              : 
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   DESTPTR is advanced past the stored bytes.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      /* The promoted value is wider than what we need to store; take
	 the low SIZE_TO_MOVE bytes of it.  */
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* Word-sized or smaller pieces can use strset, which advances
	 DESTPTR as part of the pattern.  */
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      /* Wider (vector) pieces: plain store plus explicit pointer bump.  */
      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
    8417              : /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
    8418              : static void
    8419          311 : expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
    8420              :                                  rtx count, int max_size)
    8421              : {
    8422          622 :   count = expand_simple_binop (counter_mode (count), AND, count,
    8423          311 :                                GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
    8424          311 :   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
    8425          311 :                                  gen_lowpart (QImode, value), count, QImode,
    8426              :                                  1, max_size / 2, true);
    8427          311 : }
    8428              : 
/* Callback routine for store_by_pieces.  Return the RTL of a register
   containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
   is an integer or a word vector register.  If PREV_P isn't nullptr,
   it has the RTL info from the previous iteration.  */

static rtx
setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
			 fixed_size_mode mode)
{
  rtx target;
  by_pieces_prev *prev = (by_pieces_prev *) prev_p;
  /* Reuse the value produced by the previous iteration when its mode
     matches, or when only a same-sized vector reinterpretation is
     needed.  */
  if (prev)
    {
      rtx prev_op = prev->data;
      if (prev_op)
	{
	  machine_mode prev_mode = GET_MODE (prev_op);
	  if (prev_mode == mode)
	    return prev_op;
	  if (VECTOR_MODE_P (prev_mode)
	      && VECTOR_MODE_P (mode)
	      && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
	    {
	      target = gen_rtx_SUBREG (mode, prev_op, 0);
	      return target;
	    }
	}
    }

  rtx op = (rtx) op_p;
  machine_mode op_mode = GET_MODE (op);

  if (VECTOR_MODE_P (mode))
    {
      /* Caller wants a QImode vector of the requested size.  */
      gcc_assert (GET_MODE_INNER (mode) == QImode);

      unsigned int op_size = GET_MODE_SIZE (op_mode);
      unsigned int size = GET_MODE_SIZE (mode);
      unsigned int nunits;
      machine_mode vec_mode;
      if (op_size < size)
	{
	  /* If OP size is smaller than MODE size, duplicate it.  */
	  nunits = size / GET_MODE_SIZE (QImode);
	  vec_mode = mode_for_vector (QImode, nunits).require ();
	  nunits = size / op_size;
	  gcc_assert (SCALAR_INT_MODE_P (op_mode));
	  machine_mode dup_mode
	    = mode_for_vector (as_a <scalar_mode> (op_mode),
			       nunits).require ();
	  target = gen_reg_rtx (vec_mode);
	  op = gen_vec_duplicate (dup_mode, op);
	  rtx dup_op = gen_reg_rtx (dup_mode);
	  emit_move_insn (dup_op, op);
	  /* View the duplicated value as a QImode vector.  */
	  op = gen_rtx_SUBREG (vec_mode, dup_op, 0);
	  emit_move_insn (target, op);
	  return target;
	}
      /* OP is at least as wide as MODE: reinterpret it as a QImode
	 vector of OP's size, then narrow if needed.  */
      nunits = op_size / GET_MODE_SIZE (QImode);
      vec_mode = mode_for_vector (QImode, nunits).require ();
      target = gen_reg_rtx (vec_mode);
      op = gen_rtx_SUBREG (vec_mode, op, 0);
      emit_move_insn (target, op);
      if (op_size == size)
	return target;

      /* OP was wider: take the low part in the requested mode.  */
      rtx tmp = gen_reg_rtx (mode);
      target = gen_rtx_SUBREG (mode, target, 0);
      emit_move_insn (tmp, target);
      return tmp;
    }

  /* Scalar MODE requested.  If OP is a word vector, extract its first
     word element via a subreg first.  */
  if (VECTOR_MODE_P (op_mode))
    {
      gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
      target = gen_reg_rtx (word_mode);
      op = gen_rtx_SUBREG (word_mode, op, 0);
      emit_move_insn (target, op);
    }
  else
    target = op;

  if (mode == GET_MODE (target))
    return target;

  /* Narrow (or reinterpret) TARGET to the requested scalar mode.  */
  rtx tmp = gen_reg_rtx (mode);
  target = gen_rtx_SUBREG (mode, target, 0);
  emit_move_insn (tmp, target);
  return tmp;
}
    8519              : 
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  /* With a compile-time count the residue is known exactly; let the
     generic store-by-pieces machinery emit it, preferring the vector
     form of the value when one was provided.  */
  if (CONST_INT_P (count))
    {
      unsigned HOST_WIDE_INT countval = UINTVAL (count);
      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
      unsigned int destalign = MEM_ALIGN (destmem);
      cfun->machine->by_pieces_in_use = true;
      store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
		       vec_value ? vec_value : value, destalign, true,
		       RETURN_BEGIN);
      cfun->machine->by_pieces_in_use = false;
      return;
    }
  /* A large dynamic residue is handled by a small byte loop.  */
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  /* Otherwise emit one conditional store per power-of-two residue bit,
     from the widest piece down to a single byte.  Each strset advances
     DESTPTR as part of the pattern.  */
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
    8611              : 
    8612              : /* Adjust COUNTER by the VALUE.  */
    8613              : static void
    8614           19 : ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
    8615              : {
    8616           19 :   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
    8617           19 : }
    8618              : 
    8619              : /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
    8620              :    DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
    8621              :    Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
    8622              :    ignored.
    8623              :    Return value is updated DESTMEM.  */
    8624              : 
    8625              : static rtx
    8626            7 : expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
    8627              :                                   rtx destptr, rtx srcptr, rtx value,
    8628              :                                   rtx vec_value, rtx count, int align,
    8629              :                                   int desired_alignment, bool issetmem)
    8630              : {
    8631            7 :   int i;
    8632           35 :   for (i = 1; i < desired_alignment; i <<= 1)
    8633              :     {
    8634           28 :       if (align <= i)
    8635              :         {
    8636           19 :           rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
    8637           19 :           if (issetmem)
    8638              :             {
    8639           12 :               if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
    8640            2 :                 destmem = emit_memset (destmem, destptr, vec_value, i);
    8641              :               else
    8642            4 :                 destmem = emit_memset (destmem, destptr, value, i);
    8643              :             }
    8644              :           else
    8645           13 :             destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
    8646           19 :           ix86_adjust_counter (count, i);
    8647           19 :           emit_label (label);
    8648           19 :           LABEL_NUSES (label) = 1;
    8649           19 :           set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
    8650              :         }
    8651              :     }
    8652            7 :   return destmem;
    8653              : }
    8654              : 
/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  /* LABEL is the fall-through target taken when the SIZE bit of COUNT is
     clear; in that case nothing below executes.  */
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  rtx scalar_value = value;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  MOVE_MAX bounds the widest
	 usable move on the target.  */
      if (size >= 32)
	switch (MOVE_MAX)
	  {
	  case 64:
	    if (size >= 64)
	      {
		mode = V64QImode;
		break;
	      }
	    /* FALLTHRU */
	  case 32:
	    mode = V32QImode;
	    break;
	  case 16:
	    mode = V16QImode;
	    break;
	  case 8:
	    mode = DImode;
	    break;
	  default:
	    gcc_unreachable ();
	  }
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
    {
      /* For memset with vector and the size is smaller than the vector
	 size, first try the narrower vector, otherwise, use the
	 original value. */
      machine_mode inner_mode = GET_MODE_INNER (mode);
      unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
      if (nunits > 1)
	{
	  /* Narrow the vector to exactly SIZE bytes and reinterpret
	     VALUE in that mode.  */
	  mode = mode_for_vector (GET_MODE_INNER (mode),
				  nunits).require ();
	  value = gen_rtx_SUBREG (mode, value, 0);
	}
      else
	{
	  /* SIZE holds at most one vector element: fall back to the
	     widest integer mode that fits, taken from SCALAR_VALUE.  */
	  scalar_int_mode smode
	    = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
	  gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
		      >= GET_MODE_SIZE (smode));
	  mode = smode;
	  if (GET_MODE (scalar_value) == mode)
	    value = scalar_value;
	  else
	    value = gen_rtx_SUBREG (mode, scalar_value, 0);
	}
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  /* First pass: store/copy SIZE bytes starting at the block head.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  /* Rebase addresses to DESTPTR + COUNT - 2*SIZE so the second pass
     covers the tail; for COUNT in SIZE..2*SIZE-1 the two passes together
     (possibly overlapping) cover the whole block.  */
  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  /* Second pass: store/copy the last SIZE bytes of the block.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  /* Skip target when this size class does not apply.  */
  emit_label (label);
  LABEL_NUSES (label) = 1;
}
    8780              : 
/* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
   and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
   bytes and last SIZE bytes adjusting DESTPTR/SRCPTR/COUNT in a way we can
   proceed with a loop copying SIZE bytes at once. Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence. The label is created
   on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4. Bigger sizes are handled analogously
       if (COUNT & 4)
	 {
	    copy 4 bytes from SRCPTR to DESTPTR
	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	    goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	    copy 2 bytes from SRCPTR to DESTPTR
	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DEST_PTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;	/* Bytes emitted while aligning the destination.  */
  rtx mode_value;

  /* Choose proper value to copy: the full-width vector constant when the
     main loop moves in a vector mode, the scalar otherwise.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      /* COUNT >= SIZE goes to the main-loop path below.  */
      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3 by emitting one overlapping head/tail copy per
	 power-of-two size class.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3 with one halfword store at COUNT - 2, which
	 together with the byte store above covers both counts.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    /* Compile-time knowledge guarantees COUNT >= SIZE; no small-block
       code is needed.  */
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
       emit_label (loop_label);
       LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }


  /* Copy last SIZE bytes.  The address is rebased to COUNT - SIZE -
     PROLOG_SIZE relative to the current (already advanced) DESTMEM.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  /* SIZE may need several MODE-sized moves; the first was emitted above.  */
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  SAVEDDEST becomes the (negative
	 when viewed signed) difference old - new pointer.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       NULL_RTX, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
	 library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
    9021              : 
    9022              : 
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					   rtx srcreg, rtx value, rtx vec_value,
					   int desired_align, int align_bytes,
					   bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  /* Emit one move/set per power-of-two piece selected by the bits of
     ALIGN_BYTES, from smallest to largest, until ALIGN_BYTES bytes are
     consumed.  */
  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      /* Use the vector constant when the piece is wider than the
		 scalar VALUE.  */
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  /* After the prologue the destination is known to be aligned.  */
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      /* Derive the best provable source alignment: find the largest
	 power of two on which the source offset agrees with the
	 destination offset just consumed.  */
      int src_align_bytes = get_mem_align_offset (src, desired_align
						       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		   == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
    9096              : 
    9097              : /* Return true if ALG can be used in current context.
    9098              :    Assume we expand memset if MEMSET is true.  */
    9099              : static bool
    9100       844496 : alg_usable_p (enum stringop_alg alg, bool memset,
    9101              :               addr_space_t dst_as, addr_space_t src_as)
    9102              : {
    9103       844496 :   if (alg == no_stringop)
    9104              :     return false;
    9105              :   /* It is not possible to use a library call if we have non-default
    9106              :      address space.  We can do better than the generic byte-at-a-time
    9107              :      loop, used as a fallback.  */
    9108       844496 :   if (alg == libcall &&
    9109       473212 :       !(ADDR_SPACE_GENERIC_P (dst_as) && ADDR_SPACE_GENERIC_P (src_as)))
    9110              :     return false;
    9111       844489 :   if (alg == vector_loop)
    9112       373071 :     return TARGET_SSE || TARGET_AVX;
    9113              :   /* Algorithms using the rep prefix want at least edi and ecx;
    9114              :      additionally, memset wants eax and memcpy wants esi.  Don't
    9115              :      consider such algorithms if the user has appropriated those
    9116              :      registers for their own purposes, or if we have the destination
    9117              :      in the non-default address space, since string insns cannot
    9118              :      override the destination segment.  */
    9119       657922 :   if (alg == rep_prefix_1_byte
    9120              :       || alg == rep_prefix_4_byte
    9121       657922 :       || alg == rep_prefix_8_byte)
    9122              :     {
    9123        34977 :       if (fixed_regs[CX_REG]
    9124        34973 :           || fixed_regs[DI_REG]
    9125        34969 :           || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])
    9126        34965 :           || !ADDR_SPACE_GENERIC_P (dst_as)
    9127        69942 :           || !(ADDR_SPACE_GENERIC_P (src_as) || Pmode == word_mode))
    9128           12 :         return false;
    9129              :     }
    9130              :   return true;
    9131              : }
    9132              : 
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.

   COUNT is the compile-time constant byte count, or zero when the count
   is not known at compile time.  EXPECTED_SIZE is a size estimate, -1
   when none is available.  MIN_SIZE and MAX_SIZE bound the possible
   block sizes.  MEMSET is true when expanding memset rather than
   memcpy; ZERO_MEMSET is true when the stored value is known to be
   zero.  DST_AS and SRC_AS are the address spaces of the operands.
   On return, *DYNAMIC_CHECK is set to a size threshold for emitting a
   runtime dispatch to a library call (-1 for no dynamic check), and
   *NOALIGN says whether the alignment prologue can be omitted.  RECUR
   is true only in the recursive invocation below that re-decides with
   a guessed expected size; it prevents unbounded recursion.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, addr_space_t dst_as,
	    addr_space_t src_as, int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
	  && (max_size < 256
	      || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  /* The cost tables carry separate algorithm tables for the 32-bit and
     64-bit ABIs and for memset vs. memcpy.  */
  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, dst_as, src_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, dst_as, src_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      /* An unknown count, a count not divisible by 4, or a memset of a
	 non-zero value must use the byte variant.  */
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, dst_as, src_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, dst_as, src_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     setup.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      /* Scan the per-size buckets for the first one that covers
	 EXPECTED_SIZE, remembering the last usable non-libcall choice.  */
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, dst_as, src_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, dst_as, src_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, dst_as, src_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      /* Re-decide with a guessed expected size; RECUR = true stops this
	 from nesting further.  */
      alg = decide_alg (count, new_expected_size, min_size, max_size,
			memset, zero_memset, dst_as, src_as,
			dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }

  /* Try to use some reasonable fallback algorithm.  Note that for
     non-default address spaces we default to a loop instead of
     a libcall.  */

  bool have_as = !(ADDR_SPACE_GENERIC_P (dst_as)
		   && ADDR_SPACE_GENERIC_P (src_as));

  return (alg_usable_p (algs->unknown_size, memset, dst_as, src_as)
	  ? algs->unknown_size : have_as ? loop : libcall);
}
    9298              : 
    9299              : /* Decide on alignment.  We know that the operand is already aligned to ALIGN
    9300              :    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
    9301              : static int
    9302        35285 : decide_alignment (int align,
    9303              :                   enum stringop_alg alg,
    9304              :                   int expected_size,
    9305              :                   machine_mode move_mode)
    9306              : {
    9307        35285 :   int desired_align = 0;
    9308              : 
    9309        35285 :   gcc_assert (alg != no_stringop);
    9310              : 
    9311        35285 :   if (alg == libcall)
    9312              :     return 0;
    9313        35285 :   if (move_mode == VOIDmode)
    9314              :     return 0;
    9315              : 
    9316        35285 :   desired_align = GET_MODE_SIZE (move_mode);
    9317              :   /* PentiumPro has special logic triggering for 8 byte aligned blocks.
    9318              :      copying whole cacheline at once.  */
    9319        35285 :   if (TARGET_CPU_P (PENTIUMPRO)
    9320            0 :       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    9321        35285 :     desired_align = 8;
    9322              : 
    9323        35285 :   if (optimize_size)
    9324         9681 :     desired_align = 1;
    9325        35285 :   if (desired_align < align)
    9326              :     desired_align = align;
    9327        35285 :   if (expected_size != -1 && expected_size < 4)
    9328            0 :     desired_align = align;
    9329              : 
    9330              :   return desired_align;
    9331              : }
    9332              : 
    9333              : 
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of wide specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.
   NOTE(review): broadcasting a byte looks like the memset value
   promotion rather than memcpy, and the multiplier for a byte
   broadcast would be 0x01010101 — confirm and fix this comment.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  /* Zero broadcasts to zero in any mode; just load the zero constant.  */
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));

  machine_mode valmode = GET_MODE (val);
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      /* Duplicate the scalar value for integer vector.  */
      gcc_assert ((val == const0_rtx || val == constm1_rtx)
		  || GET_MODE_INNER (mode) == valmode);
      rtx dup = gen_reg_rtx (mode);
      bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
						   val);
      gcc_assert (ok);
      return dup;
    }

  rtx tmp;
  /* Number of shift/or steps the fallback sequence below needs.  */
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode);
  if (CONST_INT_P (val))
    {
      /* Known constant: compute the broadcast value at compile time.  */
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;
  /* Without partial register stalls the insv path saves one step.  */
  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  /* Use a multiply by the recursively-built 0x01...01 constant when it
     is cheaper than the shift/or sequence in the else branch.  */
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	  + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      /* Build the broadcast by doubling the populated width with
	 shift-and-or steps: 1 -> 2 -> 4 (-> 8) bytes.  */
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
    9415              : 
    9416              : /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
    9417              :    be needed by main loop copying SIZE_NEEDED chunks and prologue getting
    9418              :    alignment from ALIGN to DESIRED_ALIGN.  */
    9419              : static rtx
    9420        12439 : promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
    9421              :                                 int align)
    9422              : {
    9423        12439 :   rtx promoted_val;
    9424              : 
    9425        12439 :   if (TARGET_64BIT
    9426        10964 :       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    9427         4511 :     promoted_val = promote_duplicated_reg (DImode, val);
    9428         7928 :   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    9429         6121 :     promoted_val = promote_duplicated_reg (SImode, val);
    9430         1807 :   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    9431            0 :     promoted_val = promote_duplicated_reg (HImode, val);
    9432              :   else
    9433              :     promoted_val = val;
    9434              : 
    9435        12439 :   return promoted_val;
    9436              : }
    9437              : 
    9438              : /* Copy the address to a Pmode register.  This is used for x32 to
    9439              :    truncate DImode TLS address to a SImode register. */
    9440              : 
    9441              : static rtx
    9442        70367 : ix86_copy_addr_to_reg (rtx addr)
    9443              : {
    9444        70367 :   rtx reg;
    9445        74954 :   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    9446              :     {
    9447        70367 :       reg = copy_addr_to_reg (addr);
    9448        70367 :       REG_POINTER (reg) = 1;
    9449        70367 :       return reg;
    9450              :     }
    9451              :   else
    9452              :     {
    9453            0 :       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
    9454            0 :       reg = copy_to_mode_reg (DImode, addr);
    9455            0 :       REG_POINTER (reg) = 1;
    9456            0 :       return gen_rtx_SUBREG (SImode, reg, 0);
    9457              :     }
    9458              : }
    9459              : 
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
    9461              :    operations when profitable.  The code depends upon architecture, block size
    9462              :    and alignment, but always has one of the following overall structures:
    9463              : 
    9464              :    Aligned move sequence:
    9465              : 
    9466              :      1) Prologue guard: Conditional that jumps up to epilogues for small
    9467              :         blocks that can be handled by epilogue alone.  This is faster
        but also needed for correctness, since the prologue assumes the block
    9469              :         is larger than the desired alignment.
    9470              : 
    9471              :         Optional dynamic check for size and libcall for large
    9472              :         blocks is emitted here too, with -minline-stringops-dynamically.
    9473              : 
    9474              :      2) Prologue: copy first few bytes in order to get destination
    9475              :         aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
    9476              :         than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
    9477              :         copied.  We emit either a jump tree on power of two sized
    9478              :         blocks, or a byte loop.
    9479              : 
    9480              :      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
    9481              :         with specified algorithm.
    9482              : 
    9483              :      4) Epilogue: code copying tail of the block that is too small to be
    9484              :         handled by main body (or up to size guarded by prologue guard).
    9485              : 
    9486              :   Misaligned move sequence
    9487              : 
     1) misaligned move prologue/epilogue containing:
    9489              :         a) Prologue handling small memory blocks and jumping to done_label
    9490              :            (skipped if blocks are known to be large enough)
        b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
    9492              :            needed by single possibly misaligned move
    9493              :            (skipped if alignment is not needed)
    9494              :         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
    9495              : 
    9496              :      2) Zero size guard dispatching to done_label, if needed
    9497              : 
     3) dispatch to library call, if needed,

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
    9501              :         with specified algorithm.  */
    9502              : bool
    9503       148683 : ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
    9504              :                            rtx align_exp, rtx expected_align_exp,
    9505              :                            rtx expected_size_exp, rtx min_size_exp,
    9506              :                            rtx max_size_exp, rtx probable_max_size_exp,
    9507              :                            bool issetmem)
    9508              : {
    9509       148683 :   rtx destreg;
    9510       148683 :   rtx srcreg = NULL;
    9511       148683 :   rtx_code_label *label = NULL;
    9512       148683 :   rtx tmp;
    9513       148683 :   rtx_code_label *jump_around_label = NULL;
    9514       148683 :   HOST_WIDE_INT align = 1;
    9515       148683 :   unsigned HOST_WIDE_INT count = 0;
    9516       148683 :   HOST_WIDE_INT expected_size = -1;
    9517       148683 :   int size_needed = 0, epilogue_size_needed;
    9518       148683 :   int desired_align = 0, align_bytes = 0;
    9519       148683 :   enum stringop_alg alg;
    9520       148683 :   rtx promoted_val = NULL;
    9521       148683 :   rtx vec_promoted_val = NULL;
    9522       148683 :   bool force_loopy_epilogue = false;
    9523       148683 :   int dynamic_check;
    9524       148683 :   bool need_zero_guard = false;
    9525       148683 :   bool noalign;
    9526       148683 :   machine_mode move_mode = VOIDmode;
    9527       148683 :   int unroll_factor = 1;
    9528              :   /* TODO: Once value ranges are available, fill in proper data.  */
    9529       148683 :   unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
    9530       148683 :   unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
    9531       148683 :   unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
    9532       148683 :   bool misaligned_prologue_used = false;
    9533       148683 :   addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
    9534              : 
    9535       148683 :   if (CONST_INT_P (align_exp))
    9536       148683 :     align = INTVAL (align_exp);
    9537              :   /* i386 can do misaligned access on reasonably increased cost.  */
    9538       148683 :   if (CONST_INT_P (expected_align_exp)
    9539       148683 :       && INTVAL (expected_align_exp) > align)
    9540              :     align = INTVAL (expected_align_exp);
    9541              :   /* ALIGN is the minimum of destination and source alignment, but we care here
    9542              :      just about destination alignment.  */
    9543       141727 :   else if (!issetmem
    9544       238893 :            && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    9545         3207 :     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
    9546              : 
    9547       148683 :   if (CONST_INT_P (count_exp))
    9548              :     {
    9549        67337 :       min_size = max_size = probable_max_size = count = expected_size
    9550        67337 :         = INTVAL (count_exp);
    9551              :       /* When COUNT is 0, there is nothing to do.  */
    9552        67337 :       if (!count)
    9553              :         return true;
    9554              :     }
    9555              :   else
    9556              :     {
    9557        81346 :       if (min_size_exp)
    9558        81346 :         min_size = INTVAL (min_size_exp);
    9559        81346 :       if (max_size_exp)
    9560        71049 :         max_size = INTVAL (max_size_exp);
    9561        81346 :       if (probable_max_size_exp)
    9562        72963 :         probable_max_size = INTVAL (probable_max_size_exp);
    9563        81346 :       if (CONST_INT_P (expected_size_exp))
    9564        81346 :         expected_size = INTVAL (expected_size_exp);
    9565              :      }
    9566              : 
    9567              :   /* Make sure we don't need to care about overflow later on.  */
    9568       148681 :   if (count > (HOST_WIDE_INT_1U << 30))
    9569              :     return false;
    9570              : 
    9571       148507 :   dst_as = MEM_ADDR_SPACE (dst);
    9572       148507 :   if (!issetmem)
    9573       104011 :     src_as = MEM_ADDR_SPACE (src);
    9574              : 
    9575              :   /* Step 0: Decide on preferred algorithm, desired alignment and
    9576              :      size of chunks to be copied by main loop.  */
    9577       148507 :   alg = decide_alg (count, expected_size, min_size, probable_max_size,
    9578        44496 :                     issetmem, issetmem && val_exp == const0_rtx,
    9579              :                     dst_as, src_as, &dynamic_check, &noalign, false);
    9580              : 
    9581       148507 :   if (dump_file)
    9582            7 :     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
    9583            7 :              stringop_alg_names[alg]);
    9584              : 
    9585       148507 :   if (alg == libcall)
    9586              :     return false;
    9587        35285 :   gcc_assert (alg != no_stringop);
    9588              : 
    9589        35285 :   if (!count)
    9590        16886 :     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
    9591        35285 :   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
    9592        35285 :   if (!issetmem)
    9593        22846 :     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
    9594              : 
    9595        35285 :   bool aligned_dstmem = false;
    9596        35285 :   unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
    9597        35285 :   bool single_insn_p = count && count <= nunits;
    9598        35285 :   if (single_insn_p)
    9599              :     {
    9600              :       /* If it can be done with a single instruction, use vector
    9601              :          instruction and don't align destination.  */
    9602            6 :       alg = vector_loop;
    9603            6 :       noalign = true;
    9604            6 :       dynamic_check = -1;
    9605              :     }
    9606              : 
    9607        35285 :   unroll_factor = 1;
    9608        35285 :   move_mode = word_mode;
    9609        35285 :   switch (alg)
    9610              :     {
    9611            0 :     case libcall:
    9612            0 :     case no_stringop:
    9613            0 :     case last_alg:
    9614            0 :       gcc_unreachable ();
    9615         1690 :     case loop_1_byte:
    9616         1690 :       need_zero_guard = true;
    9617         1690 :       move_mode = QImode;
    9618         1690 :       break;
    9619           48 :     case loop:
    9620           48 :       need_zero_guard = true;
    9621           48 :       break;
    9622           20 :     case unrolled_loop:
    9623           20 :       need_zero_guard = true;
    9624           20 :       unroll_factor = (TARGET_64BIT ? 4 : 2);
    9625              :       break;
    9626        16954 :     case vector_loop:
    9627        16954 :       need_zero_guard = true;
    9628        16954 :       unroll_factor = 4;
    9629              :       /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes.  */
    9630        16954 :       nunits /= GET_MODE_SIZE (word_mode);
    9631        16954 :       if (nunits > 1)
    9632              :         {
    9633        16950 :           move_mode = mode_for_vector (word_mode, nunits).require ();
    9634        16950 :           gcc_assert (optab_handler (mov_optab, move_mode)
    9635              :                       != CODE_FOR_nothing);
    9636              :         }
    9637              :       break;
    9638           24 :     case rep_prefix_8_byte:
    9639           24 :       move_mode = DImode;
    9640           24 :       break;
    9641        10582 :     case rep_prefix_4_byte:
    9642        10582 :       move_mode = SImode;
    9643        10582 :       break;
    9644         5967 :     case rep_prefix_1_byte:
    9645         5967 :       move_mode = QImode;
    9646         5967 :       break;
    9647              :     }
    9648        35285 :   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
    9649        35285 :   epilogue_size_needed = size_needed;
    9650              : 
    9651              :   /* If we are going to call any library calls conditionally, make sure any
    9652              :      pending stack adjustment happen before the first conditional branch,
    9653              :      otherwise they will be emitted before the library call only and won't
    9654              :      happen from the other branches.  */
    9655        35285 :   if (dynamic_check != -1)
    9656            7 :     do_pending_stack_adjust ();
    9657              : 
    9658        35285 :   desired_align = decide_alignment (align, alg, expected_size, move_mode);
    9659        35285 :   if (!TARGET_ALIGN_STRINGOPS || noalign)
    9660        33508 :     align = desired_align;
    9661              : 
    9662              :   /* Step 1: Prologue guard.  */
    9663              : 
    9664              :   /* Alignment code needs count to be in register.  */
    9665        35285 :   if (CONST_INT_P (count_exp) && desired_align > align)
    9666              :     {
    9667           20 :       if (INTVAL (count_exp) > desired_align
    9668           20 :           && INTVAL (count_exp) > size_needed)
    9669              :         {
    9670           20 :           align_bytes
    9671           20 :             = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
    9672           20 :           if (align_bytes <= 0)
    9673              :             align_bytes = 0;
    9674              :           else
    9675            0 :             align_bytes = desired_align - align_bytes;
    9676              :         }
    9677            0 :       if (align_bytes == 0)
    9678           40 :         count_exp = force_reg (counter_mode (count_exp), count_exp);
    9679              :     }
    9680        35285 :   gcc_assert (desired_align >= 1 && align >= 1);
    9681              : 
    9682        35285 :   if (!single_insn_p)
    9683              :     {
    9684              :       /* Misaligned move sequences handle both prologue and epilogue
    9685              :          at once.  Default code generation results in a smaller code
    9686              :          for large alignments and also avoids redundant job when sizes
    9687              :          are known precisely.  */
    9688        35279 :       misaligned_prologue_used
    9689        70558 :         = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
    9690        35273 :            && MAX (desired_align, epilogue_size_needed) <= 32
    9691        17998 :            && desired_align <= epilogue_size_needed
    9692        41430 :            && ((desired_align > align && !align_bytes)
    9693         6130 :                || (!count && epilogue_size_needed > 1)));
    9694              : 
    9695              :       /* Destination is aligned after the misaligned prologue.  */
    9696        35279 :       aligned_dstmem = misaligned_prologue_used;
    9697              : 
    9698        35279 :       if (noalign && !misaligned_prologue_used)
    9699              :         {
    9700              :           /* Also use misaligned prologue if alignment isn't needed and
    9701              :              destination isn't aligned.   Since alignment isn't needed,
    9702              :              the destination after prologue won't be aligned.  */
    9703        33502 :           aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
    9704        33502 :                             <= MEM_ALIGN (dst));
    9705        33502 :           if (!aligned_dstmem)
    9706        11121 :             misaligned_prologue_used = true;
    9707              :         }
    9708              :     }
    9709              : 
    9710              :   /* Do the cheap promotion to allow better CSE across the
    9711              :      main loop and epilogue (ie one load of the big constant in the
    9712              :      front of all code.
    9713              :      For now the misaligned move sequences do not have fast path
    9714              :      without broadcasting.  */
    9715        35285 :   if (issetmem
    9716        12439 :       && (alg == vector_loop
    9717         6562 :           || CONST_INT_P (val_exp)
    9718           48 :           || misaligned_prologue_used))
    9719              :     {
    9720         6514 :       if (alg == vector_loop)
    9721              :         {
    9722         5877 :           promoted_val = promote_duplicated_reg_to_size (val_exp,
    9723        11754 :                                                          GET_MODE_SIZE (word_mode),
    9724              :                                                          desired_align, align);
    9725              :           /* Duplicate the promoted scalar value if not 0 nor -1.  */
    9726         5877 :           vec_promoted_val
    9727         5877 :             = promote_duplicated_reg (move_mode,
    9728         5877 :                                       (val_exp == const0_rtx
    9729          755 :                                        || val_exp == constm1_rtx)
    9730              :                                       ? val_exp : promoted_val);
    9731              :         }
    9732              :       else
    9733              :         {
    9734         6514 :           promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
    9735              :                                                          desired_align, align);
    9736              :         }
    9737              :     }
    9738              :   /* Misaligned move sequences handles both prologues and epilogues at once.
    9739              :      Default code generation results in smaller code for large alignments and
    9740              :      also avoids redundant job when sizes are known precisely.  */
    9741        35237 :   if (misaligned_prologue_used)
    9742              :     {
    9743              :       /* Misaligned move prologue handled small blocks by itself.  */
    9744        11142 :       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
    9745        11142 :            (dst, src, &destreg, &srcreg,
    9746              :             move_mode, promoted_val, vec_promoted_val,
    9747              :             &count_exp,
    9748              :             &jump_around_label,
    9749        11142 :             desired_align < align
    9750            0 :             ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
    9751              :             desired_align, align, &min_size, dynamic_check, issetmem);
    9752        11142 :       if (!issetmem)
    9753         8549 :         src = change_address (src, BLKmode, srcreg);
    9754        11142 :       dst = change_address (dst, BLKmode, destreg);
    9755        11142 :       if (aligned_dstmem)
    9756           21 :         set_mem_align (dst, desired_align * BITS_PER_UNIT);
    9757        11142 :       epilogue_size_needed = 0;
    9758        11142 :       if (need_zero_guard
    9759        10879 :           && min_size < (unsigned HOST_WIDE_INT) size_needed)
    9760              :         {
    9761              :           /* It is possible that we copied enough so the main loop will not
    9762              :              execute.  */
    9763         7854 :           gcc_assert (size_needed > 1);
    9764         7854 :           if (jump_around_label == NULL_RTX)
    9765           50 :             jump_around_label = gen_label_rtx ();
    9766        15708 :           emit_cmp_and_jump_insns (count_exp,
    9767              :                                    GEN_INT (size_needed),
    9768              :                                    LTU, 0, counter_mode (count_exp), 1, jump_around_label);
    9769         7854 :           if (expected_size == -1
    9770           53 :               || expected_size < (desired_align - align) / 2 + size_needed)
    9771         7802 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9772              :           else
    9773           52 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9774              :         }
    9775              :     }
    9776              :   /* Ensure that alignment prologue won't copy past end of block.  */
    9777        24143 :   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    9778              :     {
    9779        16486 :       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
    9780              :       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
    9781              :          Make sure it is power of 2.  */
    9782        16486 :       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
    9783              : 
    9784              :       /* To improve performance of small blocks, we jump around the VAL
    9785              :          promoting mode.  This mean that if the promoted VAL is not constant,
    9786              :          we might not use it in the epilogue and have to use byte
    9787              :          loop variant.  */
    9788        16486 :       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
    9789        16486 :         force_loopy_epilogue = true;
    9790        16486 :       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9791        16478 :           || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9792              :         {
    9793              :           /* If main algorithm works on QImode, no epilogue is needed.
    9794              :              For small sizes just don't align anything.  */
    9795         2212 :           if (size_needed == 1)
    9796            0 :             desired_align = align;
    9797              :           else
    9798         2212 :             goto epilogue;
    9799              :         }
    9800        14274 :       else if (!count
    9801          255 :                && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9802              :         {
    9803          255 :           label = gen_label_rtx ();
    9804          510 :           emit_cmp_and_jump_insns (count_exp,
    9805              :                                    GEN_INT (epilogue_size_needed),
    9806              :                                    LTU, 0, counter_mode (count_exp), 1, label);
    9807          255 :           if (expected_size == -1 || expected_size < epilogue_size_needed)
    9808          255 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9809              :           else
    9810            0 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9811              :         }
    9812              :     }
    9813              : 
    9814              :   /* Emit code to decide on runtime whether library call or inline should be
    9815              :      used.  */
    9816        33073 :   if (dynamic_check != -1)
    9817              :     {
    9818            7 :       if (!issetmem && CONST_INT_P (count_exp))
    9819              :         {
    9820            1 :           if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
    9821              :             {
    9822            1 :               emit_block_copy_via_libcall (dst, src, count_exp);
    9823            1 :               count_exp = const0_rtx;
    9824            1 :               goto epilogue;
    9825              :             }
    9826              :         }
    9827              :       else
    9828              :         {
    9829            6 :           rtx_code_label *hot_label = gen_label_rtx ();
    9830            6 :           if (jump_around_label == NULL_RTX)
    9831            1 :             jump_around_label = gen_label_rtx ();
    9832           12 :           emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
    9833              :                                    LEU, 0, counter_mode (count_exp),
    9834              :                                    1, hot_label);
    9835            6 :           predict_jump (REG_BR_PROB_BASE * 90 / 100);
    9836            6 :           if (issetmem)
    9837            4 :             set_storage_via_libcall (dst, count_exp, val_exp);
    9838              :           else
    9839            2 :             emit_block_copy_via_libcall (dst, src, count_exp);
    9840            6 :           emit_jump (jump_around_label);
    9841            6 :           emit_label (hot_label);
    9842              :         }
    9843              :     }
    9844              : 
    9845              :   /* Step 2: Alignment prologue.  */
    9846              :   /* Do the expensive promotion once we branched off the small blocks.  */
    9847        33072 :   if (issetmem && !promoted_val)
    9848           48 :     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
    9849              :                                                    desired_align, align);
    9850              : 
    9851        33072 :   if (desired_align > align && !misaligned_prologue_used)
    9852              :     {
    9853            7 :       if (align_bytes == 0)
    9854              :         {
    9855              :           /* Except for the first move in prologue, we no longer know
    9856              :              constant offset in aliasing info.  It don't seems to worth
    9857              :              the pain to maintain it for the first move, so throw away
    9858              :              the info early.  */
    9859            7 :           dst = change_address (dst, BLKmode, destreg);
    9860            7 :           if (!issetmem)
    9861            5 :             src = change_address (src, BLKmode, srcreg);
    9862            7 :           dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
    9863              :                                             promoted_val, vec_promoted_val,
    9864              :                                             count_exp, align, desired_align,
    9865              :                                             issetmem);
    9866              :           /* At most desired_align - align bytes are copied.  */
    9867            7 :           if (min_size < (unsigned)(desired_align - align))
    9868            0 :             min_size = 0;
    9869              :           else
    9870            7 :             min_size -= desired_align - align;
    9871              :         }
    9872              :       else
    9873              :         {
    9874              :           /* If we know how many bytes need to be stored before dst is
    9875              :              sufficiently aligned, maintain aliasing info accurately.  */
    9876            0 :           dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
    9877              :                                                            srcreg,
    9878              :                                                            promoted_val,
    9879              :                                                            vec_promoted_val,
    9880              :                                                            desired_align,
    9881              :                                                            align_bytes,
    9882              :                                                            issetmem);
    9883              : 
    9884            0 :           count_exp = plus_constant (counter_mode (count_exp),
    9885            0 :                                      count_exp, -align_bytes);
    9886            0 :           count -= align_bytes;
    9887            0 :           min_size -= align_bytes;
    9888            0 :           max_size -= align_bytes;
    9889              :         }
    9890            7 :       if (need_zero_guard
    9891            7 :           && min_size < (unsigned HOST_WIDE_INT) size_needed
    9892            1 :           && (count < (unsigned HOST_WIDE_INT) size_needed
    9893            0 :               || (align_bytes == 0
    9894            0 :                   && count < ((unsigned HOST_WIDE_INT) size_needed
    9895            0 :                               + desired_align - align))))
    9896              :         {
    9897              :           /* It is possible that we copied enough so the main loop will not
    9898              :              execute.  */
    9899            1 :           gcc_assert (size_needed > 1);
    9900            1 :           if (label == NULL_RTX)
    9901            0 :             label = gen_label_rtx ();
    9902            2 :           emit_cmp_and_jump_insns (count_exp,
    9903              :                                    GEN_INT (size_needed),
    9904              :                                    LTU, 0, counter_mode (count_exp), 1, label);
    9905            1 :           if (expected_size == -1
    9906            0 :               || expected_size < (desired_align - align) / 2 + size_needed)
    9907            1 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9908              :           else
    9909            0 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9910              :         }
    9911              :     }
    9912        33072 :   if (label && size_needed == 1)
    9913              :     {
    9914            0 :       emit_label (label);
    9915            0 :       LABEL_NUSES (label) = 1;
    9916            0 :       label = NULL;
    9917            0 :       epilogue_size_needed = 1;
    9918            0 :       if (issetmem)
    9919            0 :         promoted_val = val_exp;
    9920              :     }
    9921        33072 :   else if (label == NULL_RTX && !misaligned_prologue_used)
    9922        21676 :     epilogue_size_needed = size_needed;
    9923              : 
    9924              :   /* Step 3: Main loop.  */
    9925              : 
    9926        33072 :   switch (alg)
    9927              :     {
    9928            0 :     case libcall:
    9929            0 :     case no_stringop:
    9930            0 :     case last_alg:
    9931            0 :       gcc_unreachable ();
    9932         1758 :     case loop_1_byte:
    9933         1758 :     case loop:
    9934         1758 :     case unrolled_loop:
    9935         1758 :       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
    9936              :                                      count_exp, move_mode, unroll_factor,
    9937              :                                      expected_size, issetmem);
    9938         1758 :       break;
    9939        14741 :     case vector_loop:
    9940        14741 :       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
    9941              :                                      vec_promoted_val, count_exp, move_mode,
    9942              :                                      unroll_factor, expected_size, issetmem);
    9943        14741 :       break;
    9944        16573 :     case rep_prefix_8_byte:
    9945        16573 :     case rep_prefix_4_byte:
    9946        16573 :     case rep_prefix_1_byte:
    9947        16573 :       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
    9948              :                                        val_exp, count_exp, move_mode, issetmem);
    9949        16573 :       break;
    9950              :     }
    9951              :   /* Adjust properly the offset of src and dest memory for aliasing.  */
    9952        33072 :   if (CONST_INT_P (count_exp))
    9953              :     {
    9954        18370 :       if (!issetmem)
    9955         8438 :         src = adjust_automodify_address_nv (src, BLKmode, srcreg,
    9956              :                                             (count / size_needed) * size_needed);
    9957        18370 :       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
    9958              :                                           (count / size_needed) * size_needed);
    9959              :     }
    9960              :   else
    9961              :     {
    9962        14702 :       if (!issetmem)
    9963        12471 :         src = change_address (src, BLKmode, srcreg);
    9964        14702 :       dst = change_address (dst, BLKmode, destreg);
    9965              :     }
    9966              : 
    9967              :   /* Step 4: Epilogue to copy the remaining bytes.  */
    9968        35285 :  epilogue:
    9969        35285 :   if (label)
    9970              :     {
    9971              :       /* When the main loop is done, COUNT_EXP might hold original count,
    9972              :          while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
    9973              :          Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
    9974              :          bytes. Compensate if needed.  */
    9975              : 
    9976          255 :       if (size_needed < epilogue_size_needed)
    9977              :         {
    9978            0 :           tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
    9979            0 :                                      GEN_INT (size_needed - 1), count_exp, 1,
    9980              :                                      OPTAB_DIRECT);
    9981            0 :           if (tmp != count_exp)
    9982            0 :             emit_move_insn (count_exp, tmp);
    9983              :         }
    9984          255 :       emit_label (label);
    9985          255 :       LABEL_NUSES (label) = 1;
    9986              :     }
    9987              : 
    9988        35285 :   if (count_exp != const0_rtx && epilogue_size_needed > 1)
    9989              :     {
    9990        16486 :       if (force_loopy_epilogue)
    9991            0 :         expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
    9992              :                                          epilogue_size_needed);
    9993              :       else
    9994              :         {
    9995        16486 :           if (issetmem)
    9996         8039 :             expand_setmem_epilogue (dst, destreg, promoted_val,
    9997              :                                     vec_promoted_val, count_exp,
    9998              :                                     epilogue_size_needed);
    9999              :           else
   10000         8447 :             expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
   10001              :                                     epilogue_size_needed);
   10002              :         }
   10003              :     }
   10004        35285 :   if (jump_around_label)
   10005         7856 :     emit_label (jump_around_label);
   10006              :   return true;
   10007              : }
   10008              : 
/* Fully unroll memmove of known size with up to 8 registers.

   DST/SRC are the destination/source BLKmode MEMs, DESTREG/SRCREG the
   registers holding their addresses, COUNT the known byte count and
   MODE the widest move mode to use.  Returns false when more than 8
   MODE-sized moves would be required; otherwise emits the copy and
   returns true.  */

static bool
ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
                           unsigned HOST_WIDE_INT count,
                           machine_mode mode)
{
  /* If 8 registers can cover all memory, load them into
     registers and store them together to avoid possible address
     overlap between source and destination.  */
  unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
  if (moves == 0)
    {
      /* COUNT is smaller than a single MODE-sized move: shrink MODE
         to the smallest integer mode covering COUNT bytes.  */
      mode = smallest_int_mode_for_size
        (count * BITS_PER_UNIT).require ();
      if (count == GET_MODE_SIZE (mode))
        moves = 1;
      else
        {
          /* Reduce the smallest move size by half so that MOVES == 1.  */
          mode = smallest_int_mode_for_size
            (GET_MODE_BITSIZE (mode) / 2).require ();
          moves = count / GET_MODE_SIZE (mode);
          gcc_assert (moves == 1);
        }
    }
  else if (moves > 8)
    return false;

  unsigned int i;
  rtx tmp[9];  /* Up to 8 main moves plus one epilogue register.  */

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  /* Emit every load before any store so a store can never clobber
     bytes a later load still needs (regions may overlap).  */
  rtx srcmem = change_address (src, mode, srcreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem,
                               GEN_INT (GET_MODE_SIZE (mode)),
                               GET_MODE_SIZE (mode));
    }

  /* Tail bytes not covered by the MODE-sized moves (mode sizes here
     are powers of two, so the mask extracts the remainder).  */
  unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
  machine_mode epilogue_mode = VOIDmode;
  if (epilogue_size)
    {
      /* Handle the remaining bytes with overlapping move: load the
         EPILOGUE_MODE-sized chunk that ends exactly at SRC + COUNT;
         it may overlap bytes already loaded above.  */
      epilogue_mode = smallest_int_mode_for_size
        (epilogue_size * BITS_PER_UNIT).require ();
      tmp[8] = gen_reg_rtx (epilogue_mode);
      srcmem = adjust_address (srcmem, epilogue_mode, 0);
      srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
      srcmem = offset_address (srcmem,
                               GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
                               GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (tmp[8], srcmem);
    }

  /* All data is now in registers; emit the stores.  */
  rtx destmem = change_address (dst, mode, destreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem,
                                GEN_INT (GET_MODE_SIZE (mode)),
                                GET_MODE_SIZE (mode));
    }

  if (epilogue_size)
    {
      /* Use overlapping move, mirroring the epilogue load above.  */
      destmem = adjust_address (destmem, epilogue_mode, 0);
      destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
      destmem = offset_address (destmem,
                                GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
                                GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (destmem, tmp[8]);
    }

  return true;
}
   10091              : 
   10092              : /* Expand memmove of size with MOVES * mode size and MOVES <= 4.  If
   10093              :    FORWARD is true, copy forward.  Otherwise copy backward.  */
   10094              : 
   10095              : static void
   10096         2950 : ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
   10097              :                            unsigned int moves, bool forward)
   10098              : {
   10099         2950 :   gcc_assert (moves <= 4);
   10100              : 
   10101              :   unsigned int i;
   10102              :   rtx tmp[8];
   10103              : 
   10104        14750 :   for (i = 0; i < moves; i++)
   10105        11800 :     tmp[i] = gen_reg_rtx (mode);
   10106              : 
   10107         2950 :   rtx step;
   10108         2950 :   if (forward)
   10109         2950 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10110              :   else
   10111         2950 :     step = GEN_INT (-GET_MODE_SIZE (mode));
   10112              : 
   10113              :   /* Load MOVES.  */
   10114        11800 :   for (i = 0; i < moves - 1; i++)
   10115              :     {
   10116         8850 :       emit_move_insn (tmp[i], srcmem);
   10117        17700 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10118              :     }
   10119         2950 :   emit_move_insn (tmp[i], srcmem);
   10120              : 
   10121              :   /* Store MOVES.  */
   10122        14750 :   for (i = 0; i < moves - 1; i++)
   10123              :     {
   10124         8850 :       emit_move_insn (destmem, tmp[i]);
   10125        17700 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10126              :     }
   10127         2950 :   emit_move_insn (destmem, tmp[i]);
   10128         2950 : }
   10129              : 
   10130              : /* Load MOVES of mode size into REGS.  If LAST is true, load the
   10131              :    last MOVES.  Otherwise, load the first MOVES.  */
   10132              : 
   10133              : static void
   10134         2950 : ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
   10135              :                          machine_mode mode, unsigned int moves,
   10136              :                          rtx regs[], bool last)
   10137              : {
   10138         2950 :   unsigned int i;
   10139              : 
   10140        14750 :   for (i = 0; i < moves; i++)
   10141        11800 :     regs[i] = gen_reg_rtx (mode);
   10142              : 
   10143         2950 :   rtx srcmem = change_address (src, mode, srcreg);
   10144         2950 :   rtx step;
   10145         2950 :   if (last)
   10146              :     {
   10147         1475 :       srcmem = offset_address (srcmem, count_exp, 1);
   10148         2950 :       step = GEN_INT (-GET_MODE_SIZE (mode));
   10149         2950 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10150              :     }
   10151              :   else
   10152         2950 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10153              : 
   10154        11800 :   for (i = 0; i < moves - 1; i++)
   10155              :     {
   10156         8850 :       emit_move_insn (regs[i], srcmem);
   10157        17700 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10158              :     }
   10159         2950 :   emit_move_insn (regs[i], srcmem);
   10160         2950 : }
   10161              : 
   10162              : /* Store MOVES of mode size into REGS.  If LAST is true, store the
   10163              :    last MOVES.  Otherwise, store the first MOVES.  */
   10164              : 
   10165              : static void
   10166         2950 : ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
   10167              :                           machine_mode mode, unsigned int moves,
   10168              :                           rtx regs[], bool last)
   10169              : {
   10170         2950 :   unsigned int i;
   10171              : 
   10172         2950 :   rtx destmem = change_address (dst, mode, destreg);
   10173         2950 :   rtx step;
   10174         2950 :   if (last)
   10175              :     {
   10176         1475 :       destmem = offset_address (destmem, count_exp, 1);
   10177         2950 :       step = GEN_INT (-GET_MODE_SIZE (mode));
   10178         2950 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10179              :     }
   10180              :   else
   10181         2950 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10182              : 
   10183        11800 :   for (i = 0; i < moves - 1; i++)
   10184              :     {
   10185         8850 :       emit_move_insn (destmem, regs[i]);
   10186        17700 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10187              :     }
   10188         2950 :   emit_move_insn (destmem, regs[i]);
   10189         2950 : }
   10190              : 
   10191              : /* Expand memmove of size between (MOVES / 2) * mode size and
   10192              :    MOVES * mode size with overlapping load and store.  MOVES is even.
   10193              :    MOVES >= 2 and MOVES <= 8.  */
   10194              : 
   10195              : static void
   10196        14925 : ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
   10197              :                                        rtx srcreg, rtx count_exp,
   10198              :                                        machine_mode mode,
   10199              :                                        unsigned int moves)
   10200              : {
   10201        14925 :   gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);
   10202              : 
   10203        14925 :   unsigned int half_moves = moves / 2;
   10204        14925 :   unsigned int i, j;
   10205        14925 :   rtx tmp[8];
   10206              : 
   10207        57659 :   for (i = 0; i < moves; i++)
   10208        42734 :     tmp[i] = gen_reg_rtx (mode);
   10209              : 
   10210        14925 :   rtx base_srcmem = change_address (src, mode, srcreg);
   10211              : 
   10212              :   /* Load the first half.  */
   10213        14925 :   rtx srcmem = base_srcmem;
   10214        36292 :   for (i = 0; i < half_moves - 1; i++)
   10215              :     {
   10216         6442 :       emit_move_insn (tmp[i], srcmem);
   10217        12884 :       srcmem = offset_address (srcmem,
   10218         6442 :                                GEN_INT (GET_MODE_SIZE (mode)),
   10219         6442 :                                GET_MODE_SIZE (mode));
   10220              :     }
   10221        14925 :   emit_move_insn (tmp[i], srcmem);
   10222              : 
   10223              :   /* Load the second half.  */
   10224        14925 :   srcmem = offset_address (base_srcmem, count_exp, 1);
   10225        14925 :   srcmem = offset_address (srcmem,
   10226        14925 :                            GEN_INT (-GET_MODE_SIZE (mode)),
   10227        14925 :                            GET_MODE_SIZE (mode));
   10228        36292 :   for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
   10229              :     {
   10230         6442 :       emit_move_insn (tmp[j], srcmem);
   10231        12884 :       srcmem = offset_address (srcmem,
   10232         6442 :                                GEN_INT (-GET_MODE_SIZE (mode)),
   10233         6442 :                                GET_MODE_SIZE (mode));
   10234              :     }
   10235        14925 :   emit_move_insn (tmp[j], srcmem);
   10236              : 
   10237        14925 :   rtx base_destmem = change_address (dst, mode, destreg);
   10238              : 
   10239              :   /* Store the first half.  */
   10240        14925 :   rtx destmem = base_destmem;
   10241        36292 :   for (i = 0; i < half_moves - 1; i++)
   10242              :     {
   10243         6442 :       emit_move_insn (destmem, tmp[i]);
   10244        12884 :       destmem = offset_address (destmem,
   10245         6442 :                                 GEN_INT (GET_MODE_SIZE (mode)),
   10246         6442 :                                 GET_MODE_SIZE (mode));
   10247              :     }
   10248        14925 :   emit_move_insn (destmem, tmp[i]);
   10249              : 
   10250              :   /* Store the second half.  */
   10251        14925 :   destmem = offset_address (base_destmem, count_exp, 1);
   10252        29850 :   destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
   10253        14925 :                             GET_MODE_SIZE (mode));
   10254        36292 :   for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
   10255              :     {
   10256         6442 :       emit_move_insn (destmem, tmp[j]);
   10257        12884 :       destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
   10258         6442 :                                 GET_MODE_SIZE (mode));
   10259              :     }
   10260        14925 :   emit_move_insn (destmem, tmp[j]);
   10261        14925 : }
   10262              : 
/* Expand memmove of size < mode size which is <= 64.  DST/SRC are the
   destination/source memory references and DESTREG/SRCREG registers
   holding their addresses.  COUNT_EXP is the run-time byte count and
   MIN_SIZE its known lower bound (0 when unknown).  Dispatch on the
   size range: each range of 2 or more bytes is copied with two
   overlapping moves of a suitably narrow mode; a size of 0 or 1 is
   handled with a single optional byte move.  Every path ends with a
   jump to DONE_LABEL.  When MIN_SIZE already guarantees a range, an
   unconditional jump is emitted instead of a compare, and all checks
   for smaller ranges are skipped.  */

static void
ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
                              rtx srcreg, rtx count_exp,
                              unsigned HOST_WIDE_INT min_size,
                              machine_mode mode,
                              rtx_code_label *done_label)
{
  /* Set once MIN_SIZE proves the size range, making all smaller-range
     dispatch code below dead.  */
  bool skip = false;
  machine_mode count_mode = counter_mode (count_exp);

  rtx_code_label *between_32_63_label
    = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size < 64.  */
  if (between_32_63_label)
    {
      if (min_size && min_size >= 32)
        {
          /* MIN_SIZE proves size >= 32; jump unconditionally.  */
          emit_jump_insn (gen_jump (between_32_63_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
                                 nullptr, count_mode, 1,
                                 between_32_63_label);
    }

  rtx_code_label *between_16_31_label
    = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 32.  */
  if (between_16_31_label)
    {
      if (min_size && min_size >= 16)
        {
          emit_jump_insn (gen_jump (between_16_31_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
                                 nullptr, count_mode, 1,
                                 between_16_31_label);
    }

  rtx_code_label *between_8_15_label
    = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 16.  */
  if (between_8_15_label)
    {
      if (min_size && min_size >= 8)
        {
          emit_jump_insn (gen_jump (between_8_15_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
                                 nullptr, count_mode, 1,
                                 between_8_15_label);
    }

  rtx_code_label *between_4_7_label
    = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 8.  */
  if (between_4_7_label)
    {
      if (min_size && min_size >= 4)
        {
          emit_jump_insn (gen_jump (between_4_7_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
                                 nullptr, count_mode, 1,
                                 between_4_7_label);
    }

  rtx_code_label *between_2_3_label
    = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 4.  */
  if (between_2_3_label)
    {
      if (min_size && min_size >= 2)
        {
          emit_jump_insn (gen_jump (between_2_3_label));
          emit_barrier ();
          skip = true;
        }
      else
        /* size > 1, i.e. size is 2 or 3.  */
        emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
                                 nullptr, count_mode, 1,
                                 between_2_3_label);
    }

  /* Fall-through case: size is 0 or 1.  Unreachable when MIN_SIZE
     already selected a larger range above.  */
  if (!skip)
    {
      rtx_code_label *zero_label
        = min_size == 0 ? gen_label_rtx () : nullptr;
      /* Skip if size == 0.  */
      if (zero_label)
        emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
                                 nullptr, count_mode, 1,
                                 zero_label,
                                 profile_probability::unlikely ());

      /* Move 1 byte.  */
      rtx tmp0 = gen_reg_rtx (QImode);
      rtx srcmem = change_address (src, QImode, srcreg);
      emit_move_insn (tmp0, srcmem);
      rtx destmem = change_address (dst, QImode, destreg);
      emit_move_insn (destmem, tmp0);

      if (zero_label)
        emit_label (zero_label);

      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  /* Each range below is copied with 2 overlapping moves of a mode wide
     enough to cover at least half of the range.  */
  if (between_32_63_label)
    {
      emit_label (between_32_63_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, OImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_16_31_label)
    {
      emit_label (between_16_31_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, TImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_8_15_label)
    {
      emit_label (between_8_15_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, DImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_4_7_label)
    {
      emit_label (between_4_7_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, SImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_2_3_label)
    {
      emit_label (between_2_3_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, HImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }
}
   10430              : 
   10431              : /* Expand movmem with overlapping unaligned loads and stores:
   10432              :    1. Load all sources into registers and store them together to avoid
   10433              :       possible address overlap between source and destination.
   10434              :    2. For known size, first try to fully unroll with 8 registers.
   10435              :    3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
   10436              :       and then store them together.
   10437              :    4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources
   10438              :       into 4 registers first and then store them together.
   10439              :    5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources
   10440              :       into 8 registers first and then store them together.
   10441              :    6. For size > 8 * MOVE_MAX,
   10442              :       a. If address of destination > address of source, copy backward
   10443              :          with a 4 * MOVE_MAX loop with unaligned loads and stores.  Load
   10444              :          the first 4 * MOVE_MAX into 4 registers before the loop and
   10445              :          store them after the loop to support overlapping addresses.
   10446              :       b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned
   10447              :          loads and stores.  Load the last 4 * MOVE_MAX into 4 registers
   10448              :          before the loop and store them after the loop to support
   10449              :          overlapping addresses.
   10450              :  */
   10451              : 
   10452              : bool
   10453        16815 : ix86_expand_movmem (rtx operands[])
   10454              : {
   10455              :   /* Since there are much less registers available in 32-bit mode, don't
   10456              :      inline movmem in 32-bit mode.  */
   10457        16815 :   if (!TARGET_64BIT)
   10458              :     return false;
   10459              : 
   10460        14421 :   rtx dst = operands[0];
   10461        14421 :   rtx src = operands[1];
   10462        14421 :   rtx count_exp = operands[2];
   10463        14421 :   rtx expected_size_exp = operands[5];
   10464        14421 :   rtx min_size_exp = operands[6];
   10465        14421 :   rtx probable_max_size_exp = operands[8];
   10466        14421 :   unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
   10467        14421 :   HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
   10468        14421 :   unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
   10469        14421 :   unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
   10470              : 
   10471        14421 :   if (CONST_INT_P (count_exp))
   10472              :     {
   10473         2288 :       min_size = probable_max_size = count = expected_size
   10474         2288 :         = INTVAL (count_exp);
   10475              :       /* When COUNT is 0, there is nothing to do.  */
   10476         2288 :       if (!count)
   10477              :         return true;
   10478              :     }
   10479              :   else
   10480              :     {
   10481        12133 :       if (min_size_exp)
   10482        12133 :         min_size = INTVAL (min_size_exp);
   10483        12133 :       if (probable_max_size_exp)
   10484         9221 :         probable_max_size = INTVAL (probable_max_size_exp);
   10485        12133 :       if (CONST_INT_P (expected_size_exp))
   10486        12133 :         expected_size = INTVAL (expected_size_exp);
   10487              :      }
   10488              : 
   10489              :   /* Make sure we don't need to care about overflow later on.  */
   10490        14421 :   if (count > (HOST_WIDE_INT_1U << 30))
   10491              :     return false;
   10492              : 
   10493        14385 :   addr_space_t dst_as = MEM_ADDR_SPACE (dst);
   10494        14385 :   addr_space_t src_as = MEM_ADDR_SPACE (src);
   10495        14385 :   int dynamic_check;
   10496        14385 :   bool noalign;
   10497        14385 :   enum stringop_alg alg = decide_alg (count, expected_size, min_size,
   10498              :                                       probable_max_size, false, false,
   10499              :                                       dst_as, src_as, &dynamic_check,
   10500              :                                       &noalign, false);
   10501        14385 :   if (alg == libcall)
   10502              :     return false;
   10503              : 
   10504         6118 :   rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
   10505         6118 :   rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
   10506              : 
   10507         6118 :   unsigned int move_max = MOVE_MAX;
   10508         6118 :   machine_mode mode = smallest_int_mode_for_size
   10509         6118 :     (move_max * BITS_PER_UNIT).require ();
   10510         6118 :   if (probable_max_size && probable_max_size < move_max)
   10511              :     {
   10512              :       /* Get a usable MOVE_MAX.  */
   10513         3287 :       mode = smallest_int_mode_for_size
   10514         3287 :         (probable_max_size * BITS_PER_UNIT).require ();
   10515              :       /* Reduce MOVE_MAX by half so that MOVE_MAX can be used.  */
   10516         6574 :       if (GET_MODE_SIZE (mode) > probable_max_size)
   10517         2796 :         mode = smallest_int_mode_for_size
   10518         2796 :           (GET_MODE_BITSIZE (mode) / 2).require ();
   10519         6574 :       move_max = GET_MODE_SIZE (mode);
   10520              :     }
   10521              : 
   10522              :   /* Try to fully unroll memmove of known size first.  */
   10523         6118 :   if (count
   10524         6118 :       && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
   10525              :                                     mode))
   10526              :     return true;
   10527              : 
   10528         3984 :   rtx_code_label *done_label = gen_label_rtx ();
   10529              : 
   10530         3984 :   rtx_code_label *less_vec_label = nullptr;
   10531         3984 :   if (min_size == 0 || min_size < move_max)
   10532         3342 :     less_vec_label = gen_label_rtx ();
   10533              : 
   10534         3984 :   machine_mode count_mode = counter_mode (count_exp);
   10535              : 
   10536              :   /* Jump to LESS_VEC_LABEL if size < MOVE_MAX.  */
   10537         3984 :   if (less_vec_label)
   10538         3342 :     emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
   10539              :                              nullptr, count_mode, 1,
   10540              :                              less_vec_label);
   10541              : 
   10542         3984 :   rtx_code_label *more_2x_vec_label = nullptr;
   10543         3984 :   if (probable_max_size == 0 || probable_max_size > 2 * move_max)
   10544         1831 :     more_2x_vec_label = gen_label_rtx ();
   10545              : 
   10546              :   /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX.  */
   10547         1831 :   if (more_2x_vec_label)
   10548         1831 :     emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
   10549              :                              nullptr, count_mode, 1,
   10550              :                              more_2x_vec_label);
   10551              : 
   10552         3984 :   if (min_size == 0 || min_size <= 2 * move_max)
   10553              :     {
   10554              :       /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX.  */
   10555         3960 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10556              :                                              count_exp, mode, 2);
   10557         3960 :       emit_jump_insn (gen_jump (done_label));
   10558         3960 :       emit_barrier ();
   10559              :     }
   10560              : 
   10561         3984 :   if (less_vec_label)
   10562              :     {
   10563              :       /* Size < MOVE_MAX.  */
   10564         3342 :       emit_label (less_vec_label);
   10565         3342 :       ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
   10566              :                                     count_exp, min_size, mode,
   10567              :                                     done_label);
   10568         3342 :       emit_jump_insn (gen_jump (done_label));
   10569         3342 :       emit_barrier ();
   10570              :     }
   10571              : 
   10572         3984 :   if (more_2x_vec_label)
   10573              :     {
   10574              :       /* Size > 2 * MOVE_MAX and destination may overlap with source.  */
   10575         1831 :       emit_label (more_2x_vec_label);
   10576              : 
   10577         1831 :       rtx_code_label *more_8x_vec_label = nullptr;
   10578         1831 :       if (probable_max_size == 0 || probable_max_size > 8 * move_max)
   10579         1475 :         more_8x_vec_label = gen_label_rtx ();
   10580              : 
   10581              :       /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX.  */
   10582         1475 :       if (more_8x_vec_label)
   10583         1475 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
   10584              :                                  nullptr, count_mode, 1,
   10585              :                                  more_8x_vec_label);
   10586              : 
   10587         1831 :       rtx_code_label *last_4x_vec_label = nullptr;
   10588         1831 :       if (min_size == 0 || min_size < 4 * move_max)
   10589         1813 :         last_4x_vec_label = gen_label_rtx ();
   10590              : 
   10591              :       /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX.  */
   10592         1813 :       if (last_4x_vec_label)
   10593         1813 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
   10594              :                                  nullptr, count_mode, 1,
   10595              :                                  last_4x_vec_label);
   10596              : 
   10597         1831 :       if (probable_max_size == 0 || probable_max_size > 4 * move_max)
   10598              :         {
   10599              :           /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX.  */
   10600         1543 :           ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
   10601              :                                                  srcreg, count_exp,
   10602              :                                                  mode, 8);
   10603         1543 :           emit_jump_insn (gen_jump (done_label));
   10604         1543 :           emit_barrier ();
   10605              :         }
   10606              : 
   10607         1831 :       if (last_4x_vec_label)
   10608              :         {
   10609              :           /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX.  */
   10610         1813 :           emit_label (last_4x_vec_label);
   10611         1813 :           ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
   10612              :                                                  srcreg, count_exp,
   10613              :                                                  mode, 4);
   10614         1813 :           emit_jump_insn (gen_jump (done_label));
   10615         1813 :           emit_barrier ();
   10616              :         }
   10617              : 
   10618         1831 :       if (more_8x_vec_label)
   10619              :         {
   10620              :           /* Size > 8 * MOVE_MAX.  */
   10621         1475 :           emit_label (more_8x_vec_label);
   10622              : 
   10623         1475 :           rtx loop_count = gen_reg_rtx (count_mode);
   10624         1475 :           emit_move_insn (loop_count, count_exp);
   10625              : 
   10626              :           /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
   10627              :              lower than destination address.  */
   10628         1475 :           rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
   10629         1475 :           emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
   10630         1475 :                                    GET_MODE (destreg), 1,
   10631              :                                    more_8x_vec_backward_label);
   10632              : 
   10633              :           /* Skip if source == destination which is less common.  */
   10634         1475 :           emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
   10635         1475 :                                    GET_MODE (destreg), 1, done_label,
   10636              :                                    profile_probability::unlikely ());
   10637              : 
   10638         1475 :           rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
   10639         1475 :           emit_move_insn (base_destreg, destreg);
   10640              : 
   10641              :           /* Load the last 4 * MOVE_MAX.  */
   10642         1475 :           rtx regs[4];
   10643         1475 :           ix86_expand_load_movmem (src, srcreg, count_exp, mode,
   10644              :                                    ARRAY_SIZE (regs), regs, true);
   10645              : 
   10646         1475 :           rtx srcmem = change_address (src, mode, srcreg);
   10647         1475 :           rtx destmem = change_address (dst, mode, destreg);
   10648              : 
   10649              :           /* Copy forward with a 4 * MOVE_MAX loop.  */
   10650         1475 :           rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
   10651         1475 :           emit_label (loop_4x_vec_forward_label);
   10652              : 
   10653         1475 :           ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);
   10654              : 
   10655         1475 :           rtx tmp;
   10656         1475 :           rtx delta = GEN_INT (4 * MOVE_MAX);
   10657              : 
   10658              :           /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
   10659         1475 :           tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
   10660              :                                      loop_count, delta, nullptr, 1,
   10661              :                                      OPTAB_DIRECT);
   10662         1475 :           if (tmp != loop_count)
   10663         1475 :             emit_move_insn (loop_count, tmp);
   10664              : 
   10665              :           /* Increment DESTREG and SRCREG by 4 * MOVE_MAX.  */
   10666         1475 :           tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
   10667              :                                      destreg, delta, nullptr, 1,
   10668              :                                      OPTAB_DIRECT);
   10669         1475 :           if (tmp != destreg)
   10670         1475 :             emit_move_insn (destreg, tmp);
   10671         1475 :           tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
   10672              :                                      delta, nullptr, 1, OPTAB_DIRECT);
   10673         1475 :           if (tmp != srcreg)
   10674         1475 :             emit_move_insn (srcreg, tmp);
   10675              : 
   10676              :           /* Stop if LOOP_EXP <= 4 * MOVE_MAX.  */
   10677         1475 :           emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
   10678         1475 :                                    GET_MODE (loop_count), 1,
   10679              :                                    loop_4x_vec_forward_label);
   10680              : 
   10681              :           /* Store the last 4 * MOVE_MAX.  */
   10682         1475 :           ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
   10683              :                                     ARRAY_SIZE (regs), regs, true);
   10684              : 
   10685         1475 :           emit_jump_insn (gen_jump (done_label));
   10686         1475 :           emit_barrier ();
   10687              : 
   10688              :           /* Copy backward with a 4 * MOVE_MAX loop.  */
   10689         1475 :           emit_label (more_8x_vec_backward_label);
   10690              : 
   10691         1475 :           base_destreg = gen_reg_rtx (GET_MODE (destreg));
   10692         1475 :           emit_move_insn (base_destreg, destreg);
   10693              : 
   10694              :           /* Load the first 4 * MOVE_MAX.  */
   10695         1475 :           ix86_expand_load_movmem (src, srcreg, count_exp, mode,
   10696              :                                    ARRAY_SIZE (regs), regs, false);
   10697              : 
   10698              :           /* Increment DESTREG and SRCREG by COUNT_EXP.  */
   10699         1475 :           tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
   10700              :                                      destreg, count_exp, nullptr, 1,
   10701              :                                      OPTAB_DIRECT);
   10702         1475 :           if (tmp != destreg)
   10703         1475 :             emit_move_insn (destreg, tmp);
   10704         1475 :           tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
   10705              :                                      count_exp, nullptr, 1, OPTAB_DIRECT);
   10706         1475 :           if (tmp != srcreg)
   10707         1475 :             emit_move_insn (srcreg, tmp);
   10708              : 
   10709         1475 :           srcmem = change_address (src, mode, srcreg);
   10710         1475 :           destmem = change_address (dst, mode, destreg);
   10711         2950 :           rtx step = GEN_INT (-GET_MODE_SIZE (mode));
   10712         2950 :           srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10713         2950 :           destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10714              : 
   10715         1475 :           rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
   10716         1475 :           emit_label (loop_4x_vec_backward_label);
   10717              : 
   10718         1475 :           ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);
   10719              : 
   10720              :           /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
   10721         1475 :           tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
   10722              :                                      loop_count, delta, nullptr, 1,
   10723              :                                      OPTAB_DIRECT);
   10724         1475 :           if (tmp != loop_count)
   10725         1475 :             emit_move_insn (loop_count, tmp);
   10726              : 
   10727              :           /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX.  */
   10728         1475 :           tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
   10729              :                                      destreg, delta, nullptr, 1,
   10730              :                                      OPTAB_DIRECT);
   10731         1475 :           if (tmp != destreg)
   10732         1475 :             emit_move_insn (destreg, tmp);
   10733         1475 :           tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
   10734              :                                      delta, nullptr, 1, OPTAB_DIRECT);
   10735         1475 :           if (tmp != srcreg)
   10736         1475 :             emit_move_insn (srcreg, tmp);
   10737              : 
   10738              :           /* Stop if LOOP_EXP <= 4 * MOVE_MAX.  */
   10739         1475 :           emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
   10740         1475 :                                    GET_MODE (loop_count), 1,
   10741              :                                    loop_4x_vec_backward_label);
   10742              : 
   10743              :           /* Store the first 4 * MOVE_MAX.  */
   10744         1475 :           ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
   10745              :                                     ARRAY_SIZE (regs), regs, false);
   10746              : 
   10747         1475 :           emit_jump_insn (gen_jump (done_label));
   10748         1475 :           emit_barrier ();
   10749              :         }
   10750              :     }
   10751              : 
   10752         3984 :   emit_label (done_label);
   10753              : 
   10754         3984 :   return true;
   10755              : }
   10756              : 
/* Expand cmpstrn (strncmp) or cmpmem (memcmp) inline via "repz cmpsb".

   RESULT receives the comparison result (a QImode flag comparison
   sign-extended to SImode), SRC1 and SRC2 are the MEM operands being
   compared, LENGTH is the byte count (for strncmp, the maximum count),
   ALIGN is the known alignment operand, and IS_CMPSTRN selects the
   strncmp semantics.

   Return true if the comparison was expanded inline, false to let the
   caller fall back to a library call.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
                               rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.
     The cmpstrnqi patterns hard-code those registers.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
         than actual string lengths.  We can expand the cmpstrn pattern
         to "repz cmpsb" only if one of the strings is a constant so
         that expand_builtin_strncmp() can write the length argument to
         be the minimum of the const string length and the actual length
         argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      /* Require SRC1 or SRC2 to be a MEM_REF of the address of a
         STRING_CST; bail out otherwise.  */
      if (!((t1 && TREE_CODE (t1) == MEM_REF
             && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
             && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
                 == STRING_CST))
            || (t2 && TREE_CODE (t2) == MEM_REF
                && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
                && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
                    == STRING_CST))))
        return false;
    }

  /* Force both source addresses into registers and rewrite the MEMs to
     address through them, since the cmpstrnqi patterns take the address
     registers as separate operands.  */
  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      /* Comparing zero bytes always yields 0.  */
      if (length == const0_rtx)
        {
          emit_move_insn (result, const0_rtx);
          return true;
        }
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
                                     src1, src2));
    }
  else
    {
      /* For a variable length, test it for zero first (gen_cmp_1 sets
         the flags) so the cmpstrnqi_1 pattern can skip the compare.  */
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
                                  src1, src2));
    }

  /* Convert the flag output of the string compare into an integer in
     RESULT: cmpintqi materializes the QImode ordering, which is then
     sign-extended to SImode.  */
  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
   10836              : 
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the startaddress when
        not aligned, otherwise undefined

   On return OUT holds the address of the terminating zero byte (like
   memchr), not the length; the caller subtracts the start address.

   This is just the body. It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */

static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  /* ALIGN stays 0 (unknown) unless a constant alignment was passed.  */
  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
        {
          align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
          align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

          /* Leave just the 3 lower bits.  */
          align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
                                    NULL_RTX, 0, OPTAB_WIDEN);

          /* Dispatch on (addr & 3): 0 -> already aligned, 2 -> two bytes
             to check, 3 -> one byte to check; 1 falls through and checks
             all three.  */
          emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
                                   Pmode, 1, align_4_label);
          emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
                                   Pmode, 1, align_2_label);
          emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
                                   Pmode, 1, align_3_label);
        }
      else
        {
          /* Since the alignment is 2, we have to check 2 or 0 bytes;
             check if is aligned to 4 - byte.  */

          align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
                                    NULL_RTX, 0, OPTAB_WIDEN);

          emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
                                   Pmode, 1, align_4_label);
        }

      /* MEM addresses through OUT, so re-reading MEM after OUT is
         incremented below reads the next byte — the single rtx serves
         all three byte checks.  */
      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
                               QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
        {
          emit_label (align_2_label);

          emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
                                   end_0_label);

          emit_insn (gen_add2_insn (out, const1_rtx));

          emit_label (align_3_label);
        }

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
                               end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  /* OUT is advanced before the zero test, so on loop exit it points one
     word past the word containing the zero byte; the carry-based fixup
     below walks it back.  */
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.
     Computes (v - 0x01010101) & ~v & 0x80808080: each resulting 0x80
     bit marks a byte of V that is zero.  */

  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
                         gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
                           align_4_label);

  if (TARGET_CMOVE)
    {
       /* Branch-free variant: use conditional moves to pick which half
          of the word contains the zero byte.  */
       rtx reg = gen_reg_rtx (SImode);
       rtx reg2 = gen_reg_rtx (Pmode);
       emit_move_insn (reg, tmpreg);
       emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

       /* If zero is not in the first two bytes, move two bytes forward.  */
       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
       emit_insn (gen_rtx_SET (tmpreg,
                               gen_rtx_IF_THEN_ELSE (SImode, tmp,
                                                     reg,
                                                     tmpreg)));
       /* Emit lea manually to avoid clobbering of flags.  */
       emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

       /* Re-materialize the same flags comparison for the second cmov;
          the lea above was written so as not to clobber FLAGS_REG.  */
       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
       emit_insn (gen_rtx_SET (out,
                               gen_rtx_IF_THEN_ELSE (Pmode, tmp,
                                                     reg2,
                                                     out)));
    }
  else
    {
       /* No cmov available: branch around the two-byte adjustment.  */
       rtx_code_label *end_2_label = gen_label_rtx ();
       /* Is zero in the first two bytes? */

       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                            gen_rtx_LABEL_REF (VOIDmode, end_2_label),
                            pc_rtx);
       tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
       JUMP_LABEL (tmp) = end_2_label;

       /* Not in the first two.  Move two bytes forward.  */
       emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
       emit_insn (gen_add2_insn (out, const2_rtx));

       emit_label (end_2_label);

    }

  /* Avoid branch in fixing the byte.
     At this point the low 16 bits of TMPREG tell which of the remaining
     two bytes is zero: adding the low byte to itself sets carry iff its
     0x80 marker bit was set, and the borrowing subtract then yields
     OUT - 3 (zero in the low byte) or OUT - 2 (zero in the high byte).  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
   11008              : 
   11009              : /* Expand strlen.  */
   11010              : 
   11011              : bool
   11012        13998 : ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
   11013              : {
   11014        13998 : if (TARGET_UNROLL_STRLEN
   11015        13998 :            && TARGET_INLINE_ALL_STRINGOPS
   11016           11 :            && eoschar == const0_rtx
   11017           11 :            && optimize > 1)
   11018              :     {
   11019              :       /* The generic case of strlen expander is long.  Avoid it's
   11020              :          expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
   11021           15 :       rtx addr = force_reg (Pmode, XEXP (src, 0));
   11022              :       /* Well it seems that some optimizer does not combine a call like
   11023              :          foo(strlen(bar), strlen(bar));
   11024              :          when the move and the subtraction is done here.  It does calculate
   11025              :          the length just once when these instructions are done inside of
   11026              :          output_strlen_unroll().  But I think since &bar[strlen(bar)] is
   11027              :          often used and I use one fewer register for the lifetime of
   11028              :          output_strlen_unroll() this is better.  */
   11029              : 
   11030           11 :       emit_move_insn (out, addr);
   11031              : 
   11032           11 :       ix86_expand_strlensi_unroll_1 (out, src, align);
   11033              : 
   11034              :       /* strlensi_unroll_1 returns the address of the zero at the end of
   11035              :          the string, like memchr(), so compute the length by subtracting
   11036              :          the start address.  */
   11037           11 :       emit_insn (gen_sub2_insn (out, addr));
   11038           11 :       return true;
   11039              :     }
   11040              :   else
   11041              :     return false;
   11042              : }
   11043              : 
   11044              : /* For given symbol (function) construct code to compute address of it's PLT
   11045              :    entry in large x86-64 PIC model.  */
   11046              : 
   11047              : static rtx
   11048           34 : construct_plt_address (rtx symbol)
   11049              : {
   11050           34 :   rtx tmp, unspec;
   11051              : 
   11052           34 :   gcc_assert (SYMBOL_REF_P (symbol));
   11053           34 :   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
   11054           34 :   gcc_assert (Pmode == DImode);
   11055              : 
   11056           34 :   tmp = gen_reg_rtx (Pmode);
   11057           34 :   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
   11058              : 
   11059           34 :   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
   11060           34 :   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
   11061           34 :   return tmp;
   11062              : }
   11063              : 
/* Additional registers that are clobbered by SYSV calls.
   NOTE(review): judging by the name and NUM_X86_64_MS_CLOBBERED_REGS,
   these appear to be the registers that the SysV ABI treats as
   call-clobbered but the MS ABI preserves, i.e. the extra clobbers seen
   when MS-ABI code calls a SysV-ABI function — confirm against the two
   ABI documents.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
                 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
   11074              : 
   11075              : rtx_insn *
   11076      6233353 : ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
   11077              :                   rtx callarg2,
   11078              :                   rtx pop, bool sibcall)
   11079              : {
   11080      6233353 :   rtx vec[3];
   11081      6233353 :   rtx use = NULL, call;
   11082      6233353 :   unsigned int vec_len = 0;
   11083      6233353 :   tree fndecl;
   11084      6233353 :   bool call_no_callee_saved_registers = false;
   11085              : 
   11086      6233353 :   if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
   11087              :     {
   11088      6049030 :       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
   11089      6049030 :       if (fndecl)
   11090              :         {
   11091      5789114 :           if (lookup_attribute ("interrupt",
   11092      5789114 :                                 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
   11093            1 :             error ("interrupt service routine cannot be called directly");
   11094      5789113 :           else if (ix86_type_no_callee_saved_registers_p (TREE_TYPE (fndecl)))
   11095      5789114 :             call_no_callee_saved_registers = true;
   11096      5789114 :           if (fndecl == current_function_decl
   11097      5789114 :               && decl_binds_to_current_def_p (fndecl))
   11098        11118 :             cfun->machine->recursive_function = true;
   11099              :         }
   11100              :     }
   11101              :   else
   11102              :     {
   11103       184323 :       if (MEM_P (fnaddr))
   11104              :         {
   11105       184323 :           tree mem_expr = MEM_EXPR (fnaddr);
   11106       184323 :           if (mem_expr != nullptr
   11107       184278 :               && TREE_CODE (mem_expr) == MEM_REF
   11108       368601 :               && ix86_type_no_callee_saved_registers_p (TREE_TYPE (mem_expr)))
   11109              :             call_no_callee_saved_registers = true;
   11110              :         }
   11111              : 
   11112              :       fndecl = NULL_TREE;
   11113              :     }
   11114              : 
   11115      6233353 :   if (pop == const0_rtx)
   11116            0 :     pop = NULL;
   11117      6233353 :   gcc_assert (!TARGET_64BIT || !pop);
   11118              : 
   11119      6233353 :   rtx addr = XEXP (fnaddr, 0);
   11120      6233353 :   if (TARGET_MACHO && !TARGET_64BIT)
   11121              :     {
   11122              : #if TARGET_MACHO
   11123              :       if (flag_pic && SYMBOL_REF_P (XEXP (fnaddr, 0)))
   11124              :         fnaddr = machopic_indirect_call_target (fnaddr);
   11125              : #endif
   11126              :     }
   11127              :   else
   11128              :     {
   11129              :       /* Static functions and indirect calls don't need the pic register.  Also,
   11130              :          check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
   11131              :          it an indirect call.  */
   11132      6233353 :       if (flag_pic
   11133       529637 :           && SYMBOL_REF_P (addr)
   11134      6736305 :           && ix86_call_use_plt_p (addr))
   11135              :         {
   11136       402347 :           if (flag_plt
   11137       402347 :               && (SYMBOL_REF_DECL (addr) == NULL_TREE
   11138       402313 :                   || !lookup_attribute ("noplt",
   11139       402313 :                                         DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
   11140              :             {
   11141       402312 :               if (!TARGET_64BIT
   11142       223508 :                   || (ix86_cmodel == CM_LARGE_PIC
   11143              :                       && DEFAULT_ABI != MS_ABI))
   11144              :                 {
   11145       536446 :                   use_reg (&use, gen_rtx_REG (Pmode,
   11146              :                                               REAL_PIC_OFFSET_TABLE_REGNUM));
   11147       178838 :                   if (ix86_use_pseudo_pic_reg ())
   11148       357642 :                     emit_move_insn (gen_rtx_REG (Pmode,
   11149       178838 :                                                  REAL_PIC_OFFSET_TABLE_REGNUM),
   11150              :                                     pic_offset_table_rtx);
   11151              :                 }
   11152              :             }
   11153           35 :           else if (!TARGET_PECOFF && !TARGET_MACHO)
   11154              :             {
   11155           35 :               if (TARGET_64BIT
   11156           35 :                   && ix86_cmodel == CM_LARGE_PIC
   11157              :                   && DEFAULT_ABI != MS_ABI)
   11158              :                 {
   11159            1 :                   fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
   11160              :                                            UNSPEC_GOT);
   11161            1 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11162            1 :                   fnaddr = force_reg (Pmode, fnaddr);
   11163            1 :                   fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
   11164              :                 }
   11165           34 :               else if (TARGET_64BIT)
   11166              :                 {
   11167           38 :                   fnaddr = gen_rtx_UNSPEC (Pmode,
   11168              :                                            gen_rtvec (1, addr),
   11169              :                                            UNSPEC_GOTPCREL);
   11170           38 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11171              :                 }
   11172              :               else
   11173              :                 {
   11174            0 :                   fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
   11175              :                                            UNSPEC_GOT);
   11176            0 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11177            0 :                   fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
   11178              :                                          fnaddr);
   11179              :                 }
   11180           39 :               fnaddr = gen_const_mem (Pmode, fnaddr);
   11181              :               /* Pmode may not be the same as word_mode for x32, which
   11182              :                  doesn't support indirect branch via 32-bit memory slot.
   11183              :                  Since x32 GOT slot is 64 bit with zero upper 32 bits,
   11184              :                  indirect branch via x32 GOT slot is OK.  */
   11185           35 :               if (GET_MODE (fnaddr) != word_mode)
   11186            4 :                 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
   11187           35 :               fnaddr = gen_rtx_MEM (QImode, fnaddr);
   11188              :             }
   11189              :         }
   11190              :     }
   11191              : 
   11192              :   /* Skip setting up RAX register for -mskip-rax-setup when there are no
   11193              :      parameters passed in vector registers.  */
   11194      6233353 :   if (TARGET_64BIT
   11195      5393571 :       && (INTVAL (callarg2) > 0
   11196      5332411 :           || (INTVAL (callarg2) == 0
   11197       318214 :               && (TARGET_SSE || !flag_skip_rax_setup))))
   11198              :     {
   11199       379372 :       rtx al = gen_rtx_REG (QImode, AX_REG);
   11200       379372 :       emit_move_insn (al, callarg2);
   11201       379372 :       use_reg (&use, al);
   11202              :     }
   11203              : 
   11204      6233353 :   if (ix86_cmodel == CM_LARGE_PIC
   11205              :       && !TARGET_PECOFF
   11206           45 :       && MEM_P (fnaddr)
   11207           45 :       && SYMBOL_REF_P (XEXP (fnaddr, 0))
   11208      6233390 :       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
   11209           34 :     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
   11210              :   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
   11211              :      branch via x32 GOT slot is OK.  */
   11212      6233319 :   else if (TARGET_X32
   11213           74 :       && MEM_P (fnaddr)
   11214           74 :       && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
   11215            8 :       && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)
   11216      6233323 :       && !TARGET_INDIRECT_BRANCH_REGISTER)
   11217              :     ;
   11218      6233319 :   else if (sibcall
   11219      6233319 :            ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
   11220      6104161 :            : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
   11221              :     {
   11222          532 :       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
   11223          532 :       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
   11224              :     }
   11225              : 
   11226              :   /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
   11227              :      mask off code pointers here.
   11228              :      TODO: also need to handle indirect jump.  */
   11229      6234403 :   if (ix86_memtag_can_tag_addresses () && !fndecl
   11230      6233377 :       && sanitize_flags_p (SANITIZE_HWADDRESS))
   11231              :     {
   11232           24 :       rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
   11233              :                                                         NULL_RTX);
   11234           24 :       fnaddr = gen_rtx_MEM (QImode, untagged_addr);
   11235              :     }
   11236              : 
   11237      6233353 :   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
   11238              : 
   11239      6233353 :   if (retval)
   11240      2464922 :     call = gen_rtx_SET (retval, call);
   11241      6233353 :   vec[vec_len++] = call;
   11242              : 
   11243      6233353 :   if (pop)
   11244              :     {
   11245       450458 :       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
   11246       225229 :       pop = gen_rtx_SET (stack_pointer_rtx, pop);
   11247       225229 :       vec[vec_len++] = pop;
   11248              :     }
   11249              : 
   11250      6233353 :   static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
   11251              : 
   11252      6233353 :   if ((cfun->machine->call_saved_registers
   11253      6233353 :        == TYPE_NO_CALLER_SAVED_REGISTERS)
   11254      6233353 :       && (!fndecl
   11255          468 :           || (!TREE_THIS_VOLATILE (fndecl)
   11256          186 :               && !lookup_attribute ("no_caller_saved_registers",
   11257          186 :                                     TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
   11258              :     {
   11259          182 :       bool is_64bit_ms_abi = (TARGET_64BIT
   11260          182 :                               && ix86_function_abi (fndecl) == MS_ABI);
   11261          182 :       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
   11262              : 
   11263              :       /* If there are no caller-saved registers, add all registers
   11264              :          that are clobbered by the call which returns.  */
   11265        16926 :       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
   11266        16744 :         if (!fixed_regs[i]
   11267         3242 :             && (ix86_call_used_regs[i] == 1
   11268         1506 :                 || (ix86_call_used_regs[i] & c_mask))
   11269         2150 :             && !STACK_REGNO_P (i)
   11270         2150 :             && !MMX_REGNO_P (i))
   11271         2150 :           clobber_reg (&use,
   11272         2150 :                        gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
   11273              :     }
   11274      5393389 :   else if (TARGET_64BIT_MS_ABI
   11275      6306574 :            && (!callarg2 || INTVAL (callarg2) != -2))
   11276              :     {
   11277              :       unsigned i;
   11278              : 
   11279       861718 :       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
   11280              :         {
   11281       795432 :           int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
   11282       795432 :           machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
   11283              : 
   11284       795432 :           clobber_reg (&use, gen_rtx_REG (mode, regno));
   11285              :         }
   11286              : 
   11287              :       /* Set here, but it may get cleared later.  */
   11288        66286 :       if (TARGET_CALL_MS2SYSV_XLOGUES)
   11289              :         {
   11290         7046 :           if (!TARGET_SSE)
   11291              :             ;
   11292              : 
   11293              :           /* Don't break hot-patched functions.  */
   11294         7046 :           else if (ix86_function_ms_hook_prologue (current_function_decl))
   11295              :             ;
   11296              : 
   11297              :           /* TODO: Cases not yet examined.  */
   11298         7046 :           else if (flag_split_stack)
   11299            0 :             warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
   11300              : 
   11301              :           else
   11302              :             {
   11303         7046 :               gcc_assert (!reload_completed);
   11304         7046 :               cfun->machine->call_ms2sysv = true;
   11305              :             }
   11306              :         }
   11307              :     }
   11308              : 
   11309      6233353 :   if (TARGET_MACHO && TARGET_64BIT && !sibcall
   11310              :       && ((SYMBOL_REF_P (addr) && !SYMBOL_REF_LOCAL_P (addr))
   11311              :           || !fndecl || TREE_PUBLIC (fndecl)))
   11312              :     {
   11313              :       /* We allow public functions defined in a TU to bind locally for PIC
   11314              :          code (the default) on 64bit Mach-O.
   11315              :          If such functions are not inlined, we cannot tell at compile-time if
   11316              :          they will be called via the lazy symbol resolver (this can depend on
   11317              :          options given at link-time).  Therefore, we must assume that the lazy
   11318              :          resolver could be used which clobbers R11 and R10.  */
   11319              :       clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
   11320              :       clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
   11321              :     }
   11322              : 
   11323      6233353 :   if (call_no_callee_saved_registers)
   11324              :     {
   11325              :       /* After calling a no_callee_saved_registers function, all
   11326              :          registers may be clobbered.  Clobber all registers that are
   11327              :          not used by the callee.  */
   11328           59 :       bool is_64bit_ms_abi = (TARGET_64BIT
   11329           59 :                               && ix86_function_abi (fndecl) == MS_ABI);
   11330           59 :       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
   11331         5487 :       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
   11332         5428 :         if (!fixed_regs[i]
   11333         2597 :             && i != HARD_FRAME_POINTER_REGNUM
   11334         2538 :             && !(ix86_call_used_regs[i] == 1
   11335          973 :                  || (ix86_call_used_regs[i] & c_mask))
   11336          295 :             && !STACK_REGNO_P (i)
   11337          295 :             && !MMX_REGNO_P (i))
   11338          295 :           clobber_reg (&use,
   11339          295 :                        gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
   11340              :     }
   11341              : 
   11342      6233353 :   if (vec_len > 1)
   11343       225229 :     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
   11344      6233353 :   rtx_insn *call_insn = emit_call_insn (call);
   11345      6233353 :   if (use)
   11346       598638 :     CALL_INSN_FUNCTION_USAGE (call_insn) = use;
   11347              : 
   11348      6233353 :   return call_insn;
   11349              : }
   11350              : 
/* Split simple return with popping POPC bytes from stack to indirect
   branch with stack adjustment.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  /* Pop the return address into ECX and account for the one-word
     stack adjustment in the recorded frame state.  */
  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  /* Attach CFI notes to the pop: the CFA moved up by one word, and the
     return address (normally reached through the stack) now lives in
     ECX.  */
  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Release the POPC bytes of stack arguments the callee is required
     to pop, with a matching CFA-adjust note.  */
  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
   11383              : 
   11384              : /* Errors in the source file can cause expand_expr to return const0_rtx
   11385              :    where we expect a vector.  To avoid crashing, use one of the vector
   11386              :    clear instructions.  */
   11387              : 
   11388              : static rtx
   11389       197989 : safe_vector_operand (rtx x, machine_mode mode)
   11390              : {
   11391            0 :   if (x == const0_rtx)
   11392            0 :     x = CONST0_RTX (mode);
   11393           24 :   return x;
   11394              : }
   11395              : 
   11396              : /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
   11397              : 
   11398              : static rtx
   11399         8997 : ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
   11400              : {
   11401         8997 :   rtx pat;
   11402         8997 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11403         8997 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11404         8997 :   rtx op0 = expand_normal (arg0);
   11405         8997 :   rtx op1 = expand_normal (arg1);
   11406         8997 :   machine_mode tmode = insn_data[icode].operand[0].mode;
   11407         8997 :   machine_mode mode0 = insn_data[icode].operand[1].mode;
   11408         8997 :   machine_mode mode1 = insn_data[icode].operand[2].mode;
   11409              : 
   11410         8997 :   if (VECTOR_MODE_P (mode0))
   11411         8986 :     op0 = safe_vector_operand (op0, mode0);
   11412         8997 :   if (VECTOR_MODE_P (mode1))
   11413         8850 :     op1 = safe_vector_operand (op1, mode1);
   11414              : 
   11415         2852 :   if (optimize || !target
   11416         2852 :       || GET_MODE (target) != tmode
   11417        11849 :       || !insn_data[icode].operand[0].predicate (target, tmode))
   11418         6198 :     target = gen_reg_rtx (tmode);
   11419              : 
   11420         8997 :   if (GET_MODE (op1) == SImode && mode1 == TImode)
   11421              :     {
   11422            0 :       rtx x = gen_reg_rtx (V4SImode);
   11423            0 :       emit_insn (gen_sse2_loadd (x, op1));
   11424            0 :       op1 = gen_lowpart (TImode, x);
   11425              :     }
   11426              : 
   11427         8997 :   if (!insn_data[icode].operand[1].predicate (op0, mode0))
   11428         1409 :     op0 = copy_to_mode_reg (mode0, op0);
   11429         8997 :   if (!insn_data[icode].operand[2].predicate (op1, mode1))
   11430          817 :     op1 = copy_to_mode_reg (mode1, op1);
   11431              : 
   11432         8997 :   pat = GEN_FCN (icode) (target, op0, op1);
   11433         8997 :   if (! pat)
   11434              :     return 0;
   11435              : 
   11436         8997 :   emit_insn (pat);
   11437              : 
   11438         8997 :   return target;
   11439              : }
   11440              : 
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.
   ICODE is the expander to use, EXP the CALL_EXPR being expanded,
   TARGET a suggested place for the result, M_TYPE the builtin's
   function type (encodes operand count and whether the last operand is
   an immediate, a comparison, or carries SUB_CODE as an extra operand),
   and SUB_CODE the rtx comparison/operation code for the CMP/TF
   variants.  Returns the result rtx, or 0 if no pattern could be
   generated.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
                               enum ix86_builtin_func_type m_type,
                               enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;
  bool tf_p = false;
  bool last_arg_constant = false;
  int num_memory = 0;
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  /* Decode the builtin's function type into the argument count and the
     flags that control how the operands are prepared below.  */
  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  /* Reuse TARGET only when not optimizing and it matches the output
     mode and predicate; otherwise allocate a fresh pseudo.  A reused
     memory target counts against the one-memory-operand budget.  */
  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      /* For comparison patterns operand 1 is the comparison rtx, so
         the value operands are shifted one slot to the right.  */
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
        {
          if (!insn_data[icode].operand[i + 1].predicate (op, mode))
            {
              enum insn_code new_icode = icode;
              switch (icode)
                {
                case CODE_FOR_xop_vpermil2v2df3:
                case CODE_FOR_xop_vpermil2v4sf3:
                case CODE_FOR_xop_vpermil2v4df3:
                case CODE_FOR_xop_vpermil2v8sf3:
                  error ("the last argument must be a 2-bit immediate");
                  return gen_reg_rtx (tmode);
                case CODE_FOR_xop_rotlv2di3:
                  new_icode = CODE_FOR_rotlv2di3;
                  goto xop_rotl;
                case CODE_FOR_xop_rotlv4si3:
                  new_icode = CODE_FOR_rotlv4si3;
                  goto xop_rotl;
                case CODE_FOR_xop_rotlv8hi3:
                  new_icode = CODE_FOR_rotlv8hi3;
                  goto xop_rotl;
                case CODE_FOR_xop_rotlv16qi3:
                  new_icode = CODE_FOR_rotlv16qi3;
                xop_rotl:
                  if (CONST_INT_P (op))
                    {
                      /* An out-of-range rotate count is reduced modulo
                         the element width.  */
                      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
                      op = GEN_INT (INTVAL (op) & mask);
                      gcc_checking_assert
                        (insn_data[icode].operand[i + 1].predicate (op, mode));
                    }
                  else
                    {
                      /* Non-constant rotate count: fall back to the
                         generic rotate expander, which must agree with
                         the XOP one on operand modes and predicates.  */
                      gcc_checking_assert
                        (nargs == 2
                         && insn_data[new_icode].operand[0].mode == tmode
                         && insn_data[new_icode].operand[1].mode == tmode
                         && insn_data[new_icode].operand[2].mode == mode
                         && insn_data[new_icode].operand[0].predicate
                            == insn_data[icode].operand[0].predicate
                         && insn_data[new_icode].operand[1].predicate
                            == insn_data[icode].operand[1].predicate);
                      icode = new_icode;
                      goto non_constant;
                    }
                  break;
                default:
                  gcc_unreachable ();
                }
            }
        }
      else
        {
        non_constant:
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          /* If we aren't optimizing, only allow one memory operand to be
             generated.  */
          if (memory_operand (op, mode))
            num_memory++;

          gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

          if (optimize
              || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
              || num_memory > 1)
            op = force_reg (mode, op);
        }

      xops[i] = op;
    }

  /* Emit the pattern with the operand shape the variant requires.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
        pat = GEN_FCN (icode) (target, xops[0], xops[1],
                               GEN_INT ((int)sub_code));
      else if (! comparison_p)
        pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
        {
          /* Comparison patterns take the comparison rtx itself as
             operand 1, followed by both value operands.  */
          rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
                                       xops[0], xops[1]);

          pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
        }
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
   11670              : 
   11671              : /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   11672              :    insns with vec_merge.  */
   11673              : 
   11674              : static rtx
   11675           52 : ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
   11676              :                                     rtx target)
   11677              : {
   11678           52 :   rtx pat;
   11679           52 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11680           52 :   rtx op1, op0 = expand_normal (arg0);
   11681           52 :   machine_mode tmode = insn_data[icode].operand[0].mode;
   11682           52 :   machine_mode mode0 = insn_data[icode].operand[1].mode;
   11683              : 
   11684           16 :   if (optimize || !target
   11685           16 :       || GET_MODE (target) != tmode
   11686           68 :       || !insn_data[icode].operand[0].predicate (target, tmode))
   11687           36 :     target = gen_reg_rtx (tmode);
   11688              : 
   11689           52 :   if (VECTOR_MODE_P (mode0))
   11690           52 :     op0 = safe_vector_operand (op0, mode0);
   11691              : 
   11692           36 :   if ((optimize && !register_operand (op0, mode0))
   11693           88 :       || !insn_data[icode].operand[1].predicate (op0, mode0))
   11694            0 :     op0 = copy_to_mode_reg (mode0, op0);
   11695              : 
   11696           52 :   op1 = op0;
   11697           52 :   if (!insn_data[icode].operand[2].predicate (op1, mode0))
   11698           16 :     op1 = copy_to_mode_reg (mode0, op1);
   11699              : 
   11700           52 :   pat = GEN_FCN (icode) (target, op0, op1);
   11701           52 :   if (! pat)
   11702              :     return 0;
   11703           52 :   emit_insn (pat);
   11704           52 :   return target;
   11705              : }
   11706              : 
   11707              : /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
   11708              : 
   11709              : static rtx
   11710          614 : ix86_expand_sse_compare (const struct builtin_description *d,
   11711              :                          tree exp, rtx target, bool swap)
   11712              : {
   11713          614 :   rtx pat;
   11714          614 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11715          614 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11716          614 :   rtx op0 = expand_normal (arg0);
   11717          614 :   rtx op1 = expand_normal (arg1);
   11718          614 :   rtx op2;
   11719          614 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11720          614 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11721          614 :   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
   11722          614 :   enum rtx_code comparison = d->comparison;
   11723              : 
   11724          614 :   if (VECTOR_MODE_P (mode0))
   11725          614 :     op0 = safe_vector_operand (op0, mode0);
   11726          614 :   if (VECTOR_MODE_P (mode1))
   11727          614 :     op1 = safe_vector_operand (op1, mode1);
   11728              : 
   11729              :   /* Swap operands if we have a comparison that isn't available in
   11730              :      hardware.  */
   11731          614 :   if (swap)
   11732           80 :     std::swap (op0, op1);
   11733              : 
   11734          202 :   if (optimize || !target
   11735          202 :       || GET_MODE (target) != tmode
   11736          816 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11737          412 :     target = gen_reg_rtx (tmode);
   11738              : 
   11739          412 :   if ((optimize && !register_operand (op0, mode0))
   11740          956 :       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
   11741          272 :     op0 = copy_to_mode_reg (mode0, op0);
   11742          412 :   if ((optimize && !register_operand (op1, mode1))
   11743          972 :       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
   11744           54 :     op1 = copy_to_mode_reg (mode1, op1);
   11745              : 
   11746          614 :   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
   11747          614 :   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
   11748          614 :   if (! pat)
   11749              :     return 0;
   11750          614 :   emit_insn (pat);
   11751          614 :   return target;
   11752              : }
   11753              : 
/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
   ordered EQ or unordered NE: emit the setcc of COMPARISON on SET_DST
   (checked in MODE) into the QImode subreg TARGET, preceded when
   CHECK_UNORDERED by a PF-based jump that skips the setcc for NAN
   operands.  Returns the SImode register underlying TARGET.  */

static rtx
ix86_ssecom_setcc (const enum rtx_code comparison,
                   bool check_unordered, machine_mode mode,
                   rtx set_dst, rtx target)
{

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.
     Under TARGET_AVX10_2, VCOMX/VUCOMX are generated instead of
     COMI/UCOMI.  VCOMX/VUCOMX will not set ZF for NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      /* Jump over the setcc when the comparison was unordered, leaving
         TARGET holding the value it was pre-set to by the caller.  */
      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
                  || mode == CCOmode || mode == CCPmode
                  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  /* Store the comparison result into the low byte of TARGET's
     underlying register, leaving the upper bits untouched.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (comparison, QImode,
                                          set_dst,
                                          const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
   11802              : 
   11803              : /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
   11804              : 
   11805              : static rtx
   11806          547 : ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
   11807              :                       rtx target, bool comx_ok)
   11808              : {
   11809          547 :   rtx pat, set_dst;
   11810          547 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11811          547 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11812          547 :   rtx op0 = expand_normal (arg0);
   11813          547 :   rtx op1 = expand_normal (arg1);
   11814          547 :   enum insn_code icode = d->icode;
   11815          547 :   const struct insn_data_d *insn_p = &insn_data[icode];
   11816          547 :   machine_mode mode0 = insn_p->operand[0].mode;
   11817          547 :   machine_mode mode1 = insn_p->operand[1].mode;
   11818              : 
   11819          547 :   if (VECTOR_MODE_P (mode0))
   11820          547 :     op0 = safe_vector_operand (op0, mode0);
   11821          547 :   if (VECTOR_MODE_P (mode1))
   11822          547 :     op1 = safe_vector_operand (op1, mode1);
   11823              : 
   11824          547 :   enum rtx_code comparison = d->comparison;
   11825          547 :   rtx const_val = const0_rtx;
   11826              : 
   11827          547 :   bool check_unordered = false;
   11828          547 :   machine_mode mode = CCFPmode;
   11829          547 :   switch (comparison)
   11830              :     {
   11831          194 :     case LE:    /* -> GE  */
   11832          194 :     case LT:    /* -> GT  */
   11833          194 :       std::swap (op0, op1);
   11834          194 :       comparison = swap_condition (comparison);
   11835              :       /* FALLTHRU */
   11836              :     case GT:
   11837              :     case GE:
   11838              :       break;
   11839           73 :     case EQ:
   11840           73 :       if (!TARGET_AVX10_2 || !comx_ok)
   11841           45 :         check_unordered = true;
   11842              :       mode = CCZmode;
   11843              :       break;
   11844           96 :     case NE:
   11845           96 :       if (!TARGET_AVX10_2 || !comx_ok)
   11846           68 :         check_unordered = true;
   11847           96 :       mode = CCZmode;
   11848           96 :       const_val = const1_rtx;
   11849           96 :       break;
   11850            0 :     default:
   11851            0 :       gcc_unreachable ();
   11852              :     }
   11853              : 
   11854          547 :   target = gen_reg_rtx (SImode);
   11855          547 :   emit_move_insn (target, const_val);
   11856          547 :   target = gen_rtx_SUBREG (QImode, target, 0);
   11857              : 
   11858          426 :   if ((optimize && !register_operand (op0, mode0))
   11859          925 :       || !insn_p->operand[0].predicate (op0, mode0))
   11860          169 :     op0 = copy_to_mode_reg (mode0, op0);
   11861          426 :   if ((optimize && !register_operand (op1, mode1))
   11862          924 :       || !insn_p->operand[1].predicate (op1, mode1))
   11863           49 :     op1 = copy_to_mode_reg (mode1, op1);
   11864              : 
   11865          547 :   if ((comparison == EQ || comparison == NE)
   11866          169 :       && TARGET_AVX10_2 && comx_ok)
   11867              :     {
   11868           56 :       switch (icode)
   11869              :         {
   11870              :         case CODE_FOR_sse_comi:
   11871              :           icode = CODE_FOR_avx10_2_comxsf;
   11872              :           break;
   11873           14 :         case CODE_FOR_sse_ucomi:
   11874           14 :           icode = CODE_FOR_avx10_2_ucomxsf;
   11875           14 :           break;
   11876           14 :         case CODE_FOR_sse2_comi:
   11877           14 :           icode = CODE_FOR_avx10_2_comxdf;
   11878           14 :           break;
   11879           14 :         case CODE_FOR_sse2_ucomi:
   11880           14 :           icode = CODE_FOR_avx10_2_ucomxdf;
   11881           14 :           break;
   11882              : 
   11883            0 :         default:
   11884            0 :           gcc_unreachable ();
   11885              :         }
   11886              :     }
   11887          547 :   pat = GEN_FCN (icode) (op0, op1);
   11888          547 :   if (! pat)
   11889              :     return 0;
   11890              : 
   11891          547 :   set_dst = SET_DEST (pat);
   11892          547 :   emit_insn (pat);
   11893          547 :   return ix86_ssecom_setcc (comparison, check_unordered, mode,
   11894          547 :                             set_dst, target);
   11895              : }
   11896              : 
   11897              : /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
   11898              : 
   11899              : static rtx
   11900            0 : ix86_expand_sse_round (const struct builtin_description *d, tree exp,
   11901              :                        rtx target)
   11902              : {
   11903            0 :   rtx pat;
   11904            0 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11905            0 :   rtx op1, op0 = expand_normal (arg0);
   11906            0 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11907            0 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11908              : 
   11909            0 :   if (optimize || target == 0
   11910            0 :       || GET_MODE (target) != tmode
   11911            0 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11912            0 :     target = gen_reg_rtx (tmode);
   11913              : 
   11914            0 :   if (VECTOR_MODE_P (mode0))
   11915            0 :     op0 = safe_vector_operand (op0, mode0);
   11916              : 
   11917            0 :   if ((optimize && !register_operand (op0, mode0))
   11918            0 :       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   11919            0 :     op0 = copy_to_mode_reg (mode0, op0);
   11920              : 
   11921            0 :   op1 = GEN_INT (d->comparison);
   11922              : 
   11923            0 :   pat = GEN_FCN (d->icode) (target, op0, op1);
   11924            0 :   if (! pat)
   11925              :     return 0;
   11926            0 :   emit_insn (pat);
   11927            0 :   return target;
   11928              : }
   11929              : 
   11930              : static rtx
   11931           12 : ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
   11932              :                                      tree exp, rtx target)
   11933              : {
   11934           12 :   rtx pat;
   11935           12 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11936           12 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11937           12 :   rtx op0 = expand_normal (arg0);
   11938           12 :   rtx op1 = expand_normal (arg1);
   11939           12 :   rtx op2;
   11940           12 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11941           12 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11942           12 :   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
   11943              : 
   11944            0 :   if (optimize || target == 0
   11945            0 :       || GET_MODE (target) != tmode
   11946           12 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11947           12 :     target = gen_reg_rtx (tmode);
   11948              : 
   11949           12 :   op0 = safe_vector_operand (op0, mode0);
   11950           12 :   op1 = safe_vector_operand (op1, mode1);
   11951              : 
   11952           12 :   if ((optimize && !register_operand (op0, mode0))
   11953           12 :       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   11954           12 :     op0 = copy_to_mode_reg (mode0, op0);
   11955           12 :   if ((optimize && !register_operand (op1, mode1))
   11956           12 :       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
   11957           12 :     op1 = copy_to_mode_reg (mode1, op1);
   11958              : 
   11959           12 :   op2 = GEN_INT (d->comparison);
   11960              : 
   11961           12 :   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
   11962           12 :   if (! pat)
   11963              :     return 0;
   11964           12 :   emit_insn (pat);
   11965           12 :   return target;
   11966              : }
   11967              : 
   11968              : /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
   11969              : 
   11970              : static rtx
   11971          235 : ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
   11972              :                        rtx target)
   11973              : {
   11974          235 :   rtx pat;
   11975          235 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11976          235 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11977          235 :   rtx op0 = expand_normal (arg0);
   11978          235 :   rtx op1 = expand_normal (arg1);
   11979          235 :   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
   11980          235 :   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   11981          235 :   enum rtx_code comparison = d->comparison;
   11982              : 
   11983              :   /* ptest reg, reg sets the carry flag.  */
   11984          235 :   if (comparison == LTU
   11985           75 :       && (d->code == IX86_BUILTIN_PTESTC
   11986           57 :           || d->code == IX86_BUILTIN_PTESTC256)
   11987          266 :       && rtx_equal_p (op0, op1))
   11988              :     {
   11989            2 :       if (!target)
   11990            0 :         target = gen_reg_rtx (SImode);
   11991            2 :       emit_move_insn (target, const1_rtx);
   11992            2 :       return target;
   11993              :     }
   11994              : 
   11995          233 :   if (VECTOR_MODE_P (mode0))
   11996          233 :     op0 = safe_vector_operand (op0, mode0);
   11997          233 :   if (VECTOR_MODE_P (mode1))
   11998          233 :     op1 = safe_vector_operand (op1, mode1);
   11999              : 
   12000          233 :   target = gen_reg_rtx (SImode);
   12001          233 :   emit_move_insn (target, const0_rtx);
   12002          233 :   target = gen_rtx_SUBREG (QImode, target, 0);
   12003              : 
   12004          161 :   if ((optimize && !register_operand (op0, mode0))
   12005          366 :       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   12006          100 :     op0 = copy_to_mode_reg (mode0, op0);
   12007          161 :   if ((optimize && !register_operand (op1, mode1))
   12008          367 :       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
   12009           27 :     op1 = copy_to_mode_reg (mode1, op1);
   12010              : 
   12011          233 :   pat = GEN_FCN (d->icode) (op0, op1);
   12012          233 :   if (! pat)
   12013              :     return 0;
   12014          233 :   emit_insn (pat);
   12015          233 :   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   12016              :                           gen_rtx_fmt_ee (comparison, QImode,
   12017              :                                           SET_DEST (pat),
   12018              :                                           const0_rtx)));
   12019              : 
   12020          233 :   return SUBREG_REG (target);
   12021              : }
   12022              : 
   12023              : /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
   12024              : 
   12025              : static rtx
   12026          216 : ix86_expand_sse_pcmpestr (const struct builtin_description *d,
   12027              :                           tree exp, rtx target)
   12028              : {
   12029          216 :   rtx pat;
   12030          216 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   12031          216 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   12032          216 :   tree arg2 = CALL_EXPR_ARG (exp, 2);
   12033          216 :   tree arg3 = CALL_EXPR_ARG (exp, 3);
   12034          216 :   tree arg4 = CALL_EXPR_ARG (exp, 4);
   12035          216 :   rtx scratch0, scratch1;
   12036          216 :   rtx op0 = expand_normal (arg0);
   12037          216 :   rtx op1 = expand_normal (arg1);
   12038          216 :   rtx op2 = expand_normal (arg2);
   12039          216 :   rtx op3 = expand_normal (arg3);
   12040          216 :   rtx op4 = expand_normal (arg4);
   12041          216 :   machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
   12042              : 
   12043          216 :   tmode0 = insn_data[d->icode].operand[0].mode;
   12044          216 :   tmode1 = insn_data[d->icode].operand[1].mode;
   12045          216 :   modev2 = insn_data[d->icode].operand[2].mode;
   12046          216 :   modei3 = insn_data[d->icode].operand[3].mode;
   12047          216 :   modev4 = insn_data[d->icode].operand[4].mode;
   12048          216 :   modei5 = insn_data[d->icode].operand[5].mode;
   12049          216 :   modeimm = insn_data[d->icode].operand[6].mode;
   12050              : 
   12051          216 :   if (VECTOR_MODE_P (modev2))
   12052          216 :     op0 = safe_vector_operand (op0, modev2);
   12053          216 :   if (VECTOR_MODE_P (modev4))
   12054          216 :     op2 = safe_vector_operand (op2, modev4);
   12055              : 
   12056          216 :   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
   12057            6 :     op0 = copy_to_mode_reg (modev2, op0);
   12058          216 :   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
   12059           34 :     op1 = copy_to_mode_reg (modei3, op1);
   12060          160 :   if ((optimize && !register_operand (op2, modev4))
   12061          371 :       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
   12062            5 :     op2 = copy_to_mode_reg (modev4, op2);
   12063          216 :   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
   12064           34 :     op3 = copy_to_mode_reg (modei5, op3);
   12065              : 
   12066          216 :   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
   12067              :     {
   12068           21 :       error ("the fifth argument must be an 8-bit immediate");
   12069           21 :       return const0_rtx;
   12070              :     }
   12071              : 
   12072          195 :   if (d->code == IX86_BUILTIN_PCMPESTRI128)
   12073              :     {
   12074            5 :       if (optimize || !target
   12075            5 :           || GET_MODE (target) != tmode0
   12076           34 :           || !insn_data[d->icode].operand[0].predicate (target, tmode0))
   12077           24 :         target = gen_reg_rtx (tmode0);
   12078              : 
   12079           29 :       scratch1 = gen_reg_rtx (tmode1);
   12080              : 
   12081           29 :       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
   12082              :     }
   12083          166 :   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
   12084              :     {
   12085            5 :       if (optimize || !target
   12086            5 :           || GET_MODE (target) != tmode1
   12087           36 :           || !insn_data[d->icode].operand[1].predicate (target, tmode1))
   12088           26 :         target = gen_reg_rtx (tmode1);
   12089              : 
   12090           31 :       scratch0 = gen_reg_rtx (tmode0);
   12091              : 
   12092           31 :       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
   12093              :     }
   12094              :   else
   12095              :     {
   12096          135 :       gcc_assert (d->flag);
   12097              : 
   12098          135 :       scratch0 = gen_reg_rtx (tmode0);
   12099          135 :       scratch1 = gen_reg_rtx (tmode1);
   12100              : 
   12101          135 :       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
   12102              :     }
   12103              : 
   12104          195 :   if (! pat)
   12105              :     return 0;
   12106              : 
   12107          195 :   emit_insn (pat);
   12108              : 
   12109          195 :   if (d->flag)
   12110              :     {
   12111          135 :       target = gen_reg_rtx (SImode);
   12112          135 :       emit_move_insn (target, const0_rtx);
   12113          135 :       target = gen_rtx_SUBREG (QImode, target, 0);
   12114              : 
   12115          135 :       emit_insn
   12116          135 :         (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   12117              :                       gen_rtx_fmt_ee (EQ, QImode,
   12118              :                                       gen_rtx_REG ((machine_mode) d->flag,
   12119              :                                                    FLAGS_REG),
   12120              :                                       const0_rtx)));
   12121          135 :       return SUBREG_REG (target);
   12122              :     }
   12123              :   else
   12124              :     return target;
   12125              : }
   12126              : 
   12127              : 
   12128              : /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
   12129              : 
   12130              : static rtx
   12131          275 : ix86_expand_sse_pcmpistr (const struct builtin_description *d,
   12132              :                           tree exp, rtx target)
   12133              : {
   12134          275 :   rtx pat;
   12135          275 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   12136          275 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   12137          275 :   tree arg2 = CALL_EXPR_ARG (exp, 2);
   12138          275 :   rtx scratch0, scratch1;
   12139          275 :   rtx op0 = expand_normal (arg0);
   12140          275 :   rtx op1 = expand_normal (arg1);
   12141          275 :   rtx op2 = expand_normal (arg2);
   12142          275 :   machine_mode tmode0, tmode1, modev2, modev3, modeimm;
   12143              : 
   12144          275 :   tmode0 = insn_data[d->icode].operand[0].mode;
   12145          275 :   tmode1 = insn_data[d->icode].operand[1].mode;
   12146          275 :   modev2 = insn_data[d->icode].operand[2].mode;
   12147          275 :   modev3 = insn_data[d->icode].operand[3].mode;
   12148          275 :   modeimm = insn_data[d->icode].operand[4].mode;
   12149              : 
   12150          275 :   if (VECTOR_MODE_P (modev2))
   12151          275 :     op0 = safe_vector_operand (op0, modev2);
   12152          275 :   if (VECTOR_MODE_P (modev3))
   12153          275 :     op1 = safe_vector_operand (op1, modev3);
   12154              : 
   12155          275 :   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
   12156            4 :     op0 = copy_to_mode_reg (modev2, op0);
   12157          210 :   if ((optimize && !register_operand (op1, modev3))
   12158          481 :       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
   12159            4 :     op1 = copy_to_mode_reg (modev3, op1);
   12160              : 
   12161          275 :   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
   12162              :     {
   12163           21 :       error ("the third argument must be an 8-bit immediate");
   12164           21 :       return const0_rtx;
   12165              :     }
   12166              : 
   12167          254 :   if (d->code == IX86_BUILTIN_PCMPISTRI128)
   12168              :     {
   12169            5 :       if (optimize || !target
   12170            5 :           || GET_MODE (target) != tmode0
   12171           38 :           || !insn_data[d->icode].operand[0].predicate (target, tmode0))
   12172           28 :         target = gen_reg_rtx (tmode0);
   12173              : 
   12174           33 :       scratch1 = gen_reg_rtx (tmode1);
   12175              : 
   12176           33 :       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
   12177              :     }
   12178          221 :   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
   12179              :     {
   12180            8 :       if (optimize || !target
   12181            8 :           || GET_MODE (target) != tmode1
   12182           58 :           || !insn_data[d->icode].operand[1].predicate (target, tmode1))
   12183           42 :         target = gen_reg_rtx (tmode1);
   12184              : 
   12185           50 :       scratch0 = gen_reg_rtx (tmode0);
   12186              : 
   12187           50 :       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
   12188              :     }
   12189              :   else
   12190              :     {
   12191          171 :       gcc_assert (d->flag);
   12192              : 
   12193          171 :       scratch0 = gen_reg_rtx (tmode0);
   12194          171 :       scratch1 = gen_reg_rtx (tmode1);
   12195              : 
   12196          171 :       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
   12197              :     }
   12198              : 
   12199          254 :   if (! pat)
   12200              :     return 0;
   12201              : 
   12202          254 :   emit_insn (pat);
   12203              : 
   12204          254 :   if (d->flag)
   12205              :     {
   12206          171 :       target = gen_reg_rtx (SImode);
   12207          171 :       emit_move_insn (target, const0_rtx);
   12208          171 :       target = gen_rtx_SUBREG (QImode, target, 0);
   12209              : 
   12210          171 :       emit_insn
   12211          171 :         (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   12212              :                       gen_rtx_fmt_ee (EQ, QImode,
   12213              :                                       gen_rtx_REG ((machine_mode) d->flag,
   12214              :                                                    FLAGS_REG),
   12215              :                                       const0_rtx)));
   12216          171 :       return SUBREG_REG (target);
   12217              :     }
   12218              :   else
   12219              :     return target;
   12220              : }
   12221              : 
   12222              : /* Fixup modeless constants to fit required mode.  */
   12223              : 
   12224              : static rtx
   12225       260811 : fixup_modeless_constant (rtx x, machine_mode mode)
   12226              : {
   12227       260811 :   if (GET_MODE (x) == VOIDmode)
   12228        41463 :     x = convert_to_mode (mode, x, 1);
   12229       260811 :   return x;
   12230              : }
   12231              : 
   12232              : /* Expand the outgoing argument ARG to extract unsigned char and short
   12233              :    integer constants suitable for the predicates and the instruction
   12234              :    templates which expect the unsigned expanded value.  */
   12235              : 
   12236              : static rtx
   12237       282055 : ix86_expand_unsigned_small_int_cst_argument (tree arg)
   12238              : {
   12239              :   /* When passing 0xff as an unsigned char function argument with the
   12240              :      C frontend promotion, expand_normal gets
   12241              : 
   12242              :      <integer_cst 0x7fffe6aa23a8 type <integer_type 0x7fffe98225e8 int> constant 255>
   12243              : 
   12244              :      and returns the rtx value using the sign-extended representation:
   12245              : 
   12246              :      (const_int 255 [0xff])
   12247              : 
   12248              :      Without the C frontend promotion, expand_normal gets
   12249              : 
   12250              :      <integer_cst 0x7fffe9824018 type <integer_type 0x7fffe9822348 unsigned char > constant 255>
   12251              : 
   12252              :      and returns
   12253              : 
   12254              :      (const_int -1 [0xffffffffffffffff])
   12255              : 
   12256              :      which doesn't work with the predicates nor the instruction templates
   12257              :      which expect the unsigned expanded value.  Extract the unsigned char
   12258              :      and short integer constants to return
   12259              : 
   12260              :      (const_int 255 [0xff])
   12261              : 
   12262              :      so that the expanded value is always unsigned, without the C frontend
   12263              :      promotion.  */
   12264              : 
   12265       282055 :   if (TREE_CODE (arg) == INTEGER_CST)
   12266              :     {
   12267        60352 :       tree type = TREE_TYPE (arg);
   12268        60352 :       if (INTEGRAL_TYPE_P (type)
   12269        60352 :           && TYPE_UNSIGNED (type)
   12270        82165 :           && TYPE_PRECISION (type) < TYPE_PRECISION (integer_type_node))
   12271              :         {
   12272        18326 :           HOST_WIDE_INT cst = TREE_INT_CST_LOW (arg);
   12273        18326 :           return GEN_INT (cst);
   12274              :         }
   12275              :     }
   12276              : 
   12277       263729 :   return expand_normal (arg);
   12278              : }
   12279              : 
   12280              : /* Subroutine of ix86_expand_builtin to take care of insns with
   12281              :    variable number of operands.  */
   12282              : 
   12283              : static rtx
   12284        70978 : ix86_expand_args_builtin (const struct builtin_description *d,
   12285              :                           tree exp, rtx target)
   12286              : {
   12287        70978 :   rtx pat, real_target;
   12288        70978 :   unsigned int i, nargs;
   12289        70978 :   unsigned int nargs_constant = 0;
   12290        70978 :   unsigned int mask_pos = 0;
   12291        70978 :   int num_memory = 0;
   12292        70978 :   rtx xops[6];
   12293        70978 :   bool second_arg_count = false;
   12294        70978 :   enum insn_code icode = d->icode;
   12295        70978 :   const struct insn_data_d *insn_p = &insn_data[icode];
   12296        70978 :   machine_mode tmode = insn_p->operand[0].mode;
   12297        70978 :   machine_mode rmode = VOIDmode;
   12298        70978 :   bool swap = false;
   12299        70978 :   enum rtx_code comparison = d->comparison;
   12300              : 
   12301        70978 :   switch ((enum ix86_builtin_func_type) d->flag)
   12302              :     {
   12303            0 :     case V2DF_FTYPE_V2DF_ROUND:
   12304            0 :     case V4DF_FTYPE_V4DF_ROUND:
   12305            0 :     case V8DF_FTYPE_V8DF_ROUND:
   12306            0 :     case V4SF_FTYPE_V4SF_ROUND:
   12307            0 :     case V8SF_FTYPE_V8SF_ROUND:
   12308            0 :     case V16SF_FTYPE_V16SF_ROUND:
   12309            0 :     case V8HF_FTYPE_V8HF_ROUND:
   12310            0 :     case V16HF_FTYPE_V16HF_ROUND:
   12311            0 :     case V32HF_FTYPE_V32HF_ROUND:
   12312            0 :     case V4SI_FTYPE_V4SF_ROUND:
   12313            0 :     case V8SI_FTYPE_V8SF_ROUND:
   12314            0 :     case V16SI_FTYPE_V16SF_ROUND:
   12315            0 :       return ix86_expand_sse_round (d, exp, target);
   12316           12 :     case V4SI_FTYPE_V2DF_V2DF_ROUND:
   12317           12 :     case V8SI_FTYPE_V4DF_V4DF_ROUND:
   12318           12 :     case V16SI_FTYPE_V8DF_V8DF_ROUND:
   12319           12 :       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
   12320          235 :     case INT_FTYPE_V8SF_V8SF_PTEST:
   12321          235 :     case INT_FTYPE_V4DI_V4DI_PTEST:
   12322          235 :     case INT_FTYPE_V4DF_V4DF_PTEST:
   12323          235 :     case INT_FTYPE_V4SF_V4SF_PTEST:
   12324          235 :     case INT_FTYPE_V2DI_V2DI_PTEST:
   12325          235 :     case INT_FTYPE_V2DF_V2DF_PTEST:
   12326          235 :       return ix86_expand_sse_ptest (d, exp, target);
   12327              :     case FLOAT128_FTYPE_FLOAT128:
   12328              :     case FLOAT_FTYPE_FLOAT:
   12329              :     case FLOAT_FTYPE_BFLOAT16:
   12330              :     case INT_FTYPE_INT:
   12331              :     case UINT_FTYPE_UINT:
   12332              :     case UINT16_FTYPE_UINT16:
   12333              :     case UINT64_FTYPE_INT:
   12334              :     case UINT64_FTYPE_UINT64:
   12335              :     case INT64_FTYPE_INT64:
   12336              :     case INT64_FTYPE_V4SF:
   12337              :     case INT64_FTYPE_V2DF:
   12338              :     case INT_FTYPE_V16QI:
   12339              :     case INT_FTYPE_V8QI:
   12340              :     case INT_FTYPE_V8SF:
   12341              :     case INT_FTYPE_V4DF:
   12342              :     case INT_FTYPE_V4SF:
   12343              :     case INT_FTYPE_V2DF:
   12344              :     case INT_FTYPE_V32QI:
   12345              :     case V16QI_FTYPE_V16QI:
   12346              :     case V8SI_FTYPE_V8SF:
   12347              :     case V8SI_FTYPE_V4SI:
   12348              :     case V8HI_FTYPE_V8HI:
   12349              :     case V8HI_FTYPE_V16QI:
   12350              :     case V8QI_FTYPE_V8QI:
   12351              :     case V8SF_FTYPE_V8SF:
   12352              :     case V8SF_FTYPE_V8SI:
   12353              :     case V8SF_FTYPE_V4SF:
   12354              :     case V8SF_FTYPE_V8HI:
   12355              :     case V4SI_FTYPE_V4SI:
   12356              :     case V4SI_FTYPE_V16QI:
   12357              :     case V4SI_FTYPE_V4SF:
   12358              :     case V4SI_FTYPE_V8SI:
   12359              :     case V4SI_FTYPE_V8HI:
   12360              :     case V4SI_FTYPE_V4DF:
   12361              :     case V4SI_FTYPE_V2DF:
   12362              :     case V4HI_FTYPE_V4HI:
   12363              :     case V4DF_FTYPE_V4DF:
   12364              :     case V4DF_FTYPE_V4SI:
   12365              :     case V4DF_FTYPE_V4SF:
   12366              :     case V4DF_FTYPE_V2DF:
   12367              :     case V4SF_FTYPE_V4SF:
   12368              :     case V4SF_FTYPE_V4SI:
   12369              :     case V4SF_FTYPE_V8SF:
   12370              :     case V4SF_FTYPE_V4DF:
   12371              :     case V4SF_FTYPE_V8HI:
   12372              :     case V4SF_FTYPE_V2DF:
   12373              :     case V2DI_FTYPE_V2DI:
   12374              :     case V2DI_FTYPE_V16QI:
   12375              :     case V2DI_FTYPE_V8HI:
   12376              :     case V2DI_FTYPE_V4SI:
   12377              :     case V2DF_FTYPE_V2DF:
   12378              :     case V2DF_FTYPE_V4SI:
   12379              :     case V2DF_FTYPE_V4DF:
   12380              :     case V2DF_FTYPE_V4SF:
   12381              :     case V2DF_FTYPE_V2SI:
   12382              :     case V2SI_FTYPE_V2SI:
   12383              :     case V2SI_FTYPE_V4SF:
   12384              :     case V2SI_FTYPE_V2SF:
   12385              :     case V2SI_FTYPE_V2DF:
   12386              :     case V2SF_FTYPE_V2SF:
   12387              :     case V2SF_FTYPE_V2SI:
   12388              :     case V32QI_FTYPE_V32QI:
   12389              :     case V32QI_FTYPE_V16QI:
   12390              :     case V16HI_FTYPE_V16HI:
   12391              :     case V16HI_FTYPE_V8HI:
   12392              :     case V8SI_FTYPE_V8SI:
   12393              :     case V16HI_FTYPE_V16QI:
   12394              :     case V8SI_FTYPE_V16QI:
   12395              :     case V4DI_FTYPE_V16QI:
   12396              :     case V8SI_FTYPE_V8HI:
   12397              :     case V4DI_FTYPE_V8HI:
   12398              :     case V4DI_FTYPE_V4SI:
   12399              :     case V4DI_FTYPE_V2DI:
   12400              :     case UQI_FTYPE_UQI:
   12401              :     case UHI_FTYPE_UHI:
   12402              :     case USI_FTYPE_USI:
   12403              :     case USI_FTYPE_UQI:
   12404              :     case USI_FTYPE_UHI:
   12405              :     case UDI_FTYPE_UDI:
   12406              :     case UHI_FTYPE_V16QI:
   12407              :     case USI_FTYPE_V32QI:
   12408              :     case UDI_FTYPE_V64QI:
   12409              :     case V16QI_FTYPE_UHI:
   12410              :     case V32QI_FTYPE_USI:
   12411              :     case V64QI_FTYPE_UDI:
   12412              :     case V8HI_FTYPE_UQI:
   12413              :     case V16HI_FTYPE_UHI:
   12414              :     case V32HI_FTYPE_USI:
   12415              :     case V4SI_FTYPE_UQI:
   12416              :     case V8SI_FTYPE_UQI:
   12417              :     case V4SI_FTYPE_UHI:
   12418              :     case V8SI_FTYPE_UHI:
   12419              :     case UQI_FTYPE_V8HI:
   12420              :     case UHI_FTYPE_V16HI:
   12421              :     case USI_FTYPE_V32HI:
   12422              :     case UQI_FTYPE_V4SI:
   12423              :     case UQI_FTYPE_V8SI:
   12424              :     case UHI_FTYPE_V16SI:
   12425              :     case UQI_FTYPE_V2DI:
   12426              :     case UQI_FTYPE_V4DI:
   12427              :     case UQI_FTYPE_V8DI:
   12428              :     case V16SI_FTYPE_UHI:
   12429              :     case V2DI_FTYPE_UQI:
   12430              :     case V4DI_FTYPE_UQI:
   12431              :     case V16SI_FTYPE_INT:
   12432              :     case V16SF_FTYPE_V8SF:
   12433              :     case V16SI_FTYPE_V8SI:
   12434              :     case V16SF_FTYPE_V4SF:
   12435              :     case V16SI_FTYPE_V4SI:
   12436              :     case V16SI_FTYPE_V16SF:
   12437              :     case V16SI_FTYPE_V16SI:
   12438              :     case V64QI_FTYPE_V64QI:
   12439              :     case V32HI_FTYPE_V32HI:
   12440              :     case V16SF_FTYPE_V16SF:
   12441              :     case V8DI_FTYPE_UQI:
   12442              :     case V8DI_FTYPE_V8DI:
   12443              :     case V8DF_FTYPE_V4DF:
   12444              :     case V8DF_FTYPE_V2DF:
   12445              :     case V8DF_FTYPE_V8DF:
   12446              :     case V4DI_FTYPE_V4DI:
   12447              :     case V16BF_FTYPE_V16SF:
   12448              :     case V8BF_FTYPE_V8SF:
   12449              :     case V8BF_FTYPE_V4SF:
   12450              :       nargs = 1;
   12451              :       break;
   12452           52 :     case V4SF_FTYPE_V4SF_VEC_MERGE:
   12453           52 :     case V2DF_FTYPE_V2DF_VEC_MERGE:
   12454           52 :       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
   12455         9531 :     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
   12456         9531 :     case V16QI_FTYPE_V16QI_V16QI:
   12457         9531 :     case V16QI_FTYPE_V8HI_V8HI:
   12458         9531 :     case V16HF_FTYPE_V16HF_V16HF:
   12459         9531 :     case V16SF_FTYPE_V16SF_V16SF:
   12460         9531 :     case V16SI_FTYPE_V16SI_V16SI:
   12461         9531 :     case V8QI_FTYPE_V8QI_V8QI:
   12462         9531 :     case V8QI_FTYPE_V4HI_V4HI:
   12463         9531 :     case V8HI_FTYPE_V8HI_V8HI:
   12464         9531 :     case V8HI_FTYPE_V16QI_V16QI:
   12465         9531 :     case V8HI_FTYPE_V4SI_V4SI:
   12466         9531 :     case V8HF_FTYPE_V8HF_V8HF:
   12467         9531 :     case V8SF_FTYPE_V8SF_V8SF:
   12468         9531 :     case V8SF_FTYPE_V8SF_V8SI:
   12469         9531 :     case V8DF_FTYPE_V8DF_V8DF:
   12470         9531 :     case V4SI_FTYPE_V4SI_V4SI:
   12471         9531 :     case V4SI_FTYPE_V8HI_V8HI:
   12472         9531 :     case V4SI_FTYPE_V2DF_V2DF:
   12473         9531 :     case V4HI_FTYPE_V4HI_V4HI:
   12474         9531 :     case V4HI_FTYPE_V8QI_V8QI:
   12475         9531 :     case V4HI_FTYPE_V2SI_V2SI:
   12476         9531 :     case V4DF_FTYPE_V4DF_V4DF:
   12477         9531 :     case V4DF_FTYPE_V4DF_V4DI:
   12478         9531 :     case V4SF_FTYPE_V4SF_V4SF:
   12479         9531 :     case V4SF_FTYPE_V4SF_V4SI:
   12480         9531 :     case V4SF_FTYPE_V4SF_V2SI:
   12481         9531 :     case V4SF_FTYPE_V4SF_V2DF:
   12482         9531 :     case V4SF_FTYPE_V4SF_UINT:
   12483         9531 :     case V4SF_FTYPE_V4SF_DI:
   12484         9531 :     case V4SF_FTYPE_V4SF_SI:
   12485         9531 :     case V4DI_FTYPE_V4DI_V2DI:
   12486         9531 :     case V2DI_FTYPE_V2DI_V2DI:
   12487         9531 :     case V2DI_FTYPE_V16QI_V16QI:
   12488         9531 :     case V2DI_FTYPE_V4SI_V4SI:
   12489         9531 :     case V2DI_FTYPE_V2DI_V16QI:
   12490         9531 :     case V2SI_FTYPE_V2SI_V2SI:
   12491         9531 :     case V2SI_FTYPE_V4HI_V4HI:
   12492         9531 :     case V2SI_FTYPE_V2SF_V2SF:
   12493         9531 :     case V2DF_FTYPE_V2DF_V2DF:
   12494         9531 :     case V2DF_FTYPE_V2DF_V4SF:
   12495         9531 :     case V2DF_FTYPE_V2DF_V2DI:
   12496         9531 :     case V2DF_FTYPE_V2DF_DI:
   12497         9531 :     case V2DF_FTYPE_V2DF_SI:
   12498         9531 :     case V2DF_FTYPE_V2DF_UINT:
   12499         9531 :     case V2SF_FTYPE_V2SF_V2SF:
   12500         9531 :     case V1DI_FTYPE_V1DI_V1DI:
   12501         9531 :     case V1DI_FTYPE_V8QI_V8QI:
   12502         9531 :     case V1DI_FTYPE_V2SI_V2SI:
   12503         9531 :     case V32QI_FTYPE_V16HI_V16HI:
   12504         9531 :     case V16HI_FTYPE_V8SI_V8SI:
   12505         9531 :     case V64QI_FTYPE_V64QI_V64QI:
   12506         9531 :     case V32QI_FTYPE_V32QI_V32QI:
   12507         9531 :     case V32BF_FTYPE_V32BF_V32BF:
   12508         9531 :     case V16BF_FTYPE_V16BF_V16BF:
   12509         9531 :     case V8BF_FTYPE_V8BF_V8BF:
   12510         9531 :     case V16HI_FTYPE_V32QI_V32QI:
   12511         9531 :     case V16HI_FTYPE_V16HI_V16HI:
   12512         9531 :     case V8SI_FTYPE_V4DF_V4DF:
   12513         9531 :     case V8SI_FTYPE_V8SI_V8SI:
   12514         9531 :     case V8SI_FTYPE_V16HI_V16HI:
   12515         9531 :     case V4DI_FTYPE_V4DI_V4DI:
   12516         9531 :     case V4DI_FTYPE_V8SI_V8SI:
   12517         9531 :     case V4DI_FTYPE_V32QI_V32QI:
   12518         9531 :     case V8DI_FTYPE_V64QI_V64QI:
   12519         9531 :       if (comparison == UNKNOWN)
   12520         8997 :         return ix86_expand_binop_builtin (icode, exp, target);
   12521              :       nargs = 2;
   12522              :       break;
   12523           80 :     case V4SF_FTYPE_V4SF_V4SF_SWAP:
   12524           80 :     case V2DF_FTYPE_V2DF_V2DF_SWAP:
   12525           80 :       gcc_assert (comparison != UNKNOWN);
   12526              :       nargs = 2;
   12527              :       swap = true;
   12528              :       break;
   12529         1481 :     case V16HI_FTYPE_V16HI_V8HI_COUNT:
   12530         1481 :     case V16HI_FTYPE_V16HI_SI_COUNT:
   12531         1481 :     case V8SI_FTYPE_V8SI_V4SI_COUNT:
   12532         1481 :     case V8SI_FTYPE_V8SI_SI_COUNT:
   12533         1481 :     case V4DI_FTYPE_V4DI_V2DI_COUNT:
   12534         1481 :     case V4DI_FTYPE_V4DI_INT_COUNT:
   12535         1481 :     case V8HI_FTYPE_V8HI_V8HI_COUNT:
   12536         1481 :     case V8HI_FTYPE_V8HI_SI_COUNT:
   12537         1481 :     case V4SI_FTYPE_V4SI_V4SI_COUNT:
   12538         1481 :     case V4SI_FTYPE_V4SI_SI_COUNT:
   12539         1481 :     case V4HI_FTYPE_V4HI_V4HI_COUNT:
   12540         1481 :     case V4HI_FTYPE_V4HI_SI_COUNT:
   12541         1481 :     case V2DI_FTYPE_V2DI_V2DI_COUNT:
   12542         1481 :     case V2DI_FTYPE_V2DI_SI_COUNT:
   12543         1481 :     case V2SI_FTYPE_V2SI_V2SI_COUNT:
   12544         1481 :     case V2SI_FTYPE_V2SI_SI_COUNT:
   12545         1481 :     case V1DI_FTYPE_V1DI_V1DI_COUNT:
   12546         1481 :     case V1DI_FTYPE_V1DI_SI_COUNT:
   12547         1481 :       nargs = 2;
   12548         1481 :       second_arg_count = true;
   12549         1481 :       break;
   12550         1408 :     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
   12551         1408 :     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
   12552         1408 :     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
   12553         1408 :     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
   12554         1408 :     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
   12555         1408 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
   12556         1408 :     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
   12557         1408 :     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
   12558         1408 :     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
   12559         1408 :     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
   12560         1408 :     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
   12561         1408 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
   12562         1408 :     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
   12563         1408 :     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
   12564         1408 :     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
   12565         1408 :     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
   12566         1408 :     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
   12567         1408 :     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
   12568         1408 :       nargs = 4;
   12569         1408 :       second_arg_count = true;
   12570         1408 :       break;
   12571          967 :     case UINT64_FTYPE_UINT64_UINT64:
   12572          967 :     case UINT_FTYPE_UINT_UINT:
   12573          967 :     case UINT_FTYPE_UINT_USHORT:
   12574          967 :     case UINT_FTYPE_UINT_UCHAR:
   12575          967 :     case UINT16_FTYPE_UINT16_INT:
   12576          967 :     case UINT8_FTYPE_UINT8_INT:
   12577          967 :     case UQI_FTYPE_UQI_UQI:
   12578          967 :     case UHI_FTYPE_UHI_UHI:
   12579          967 :     case USI_FTYPE_USI_USI:
   12580          967 :     case UDI_FTYPE_UDI_UDI:
   12581          967 :     case V16SI_FTYPE_V8DF_V8DF:
   12582          967 :     case V32BF_FTYPE_V16SF_V16SF:
   12583          967 :     case V16BF_FTYPE_V8SF_V8SF:
   12584          967 :     case V8BF_FTYPE_V4SF_V4SF:
   12585          967 :     case V16BF_FTYPE_V16SF_UHI:
   12586          967 :     case V8BF_FTYPE_V8SF_UQI:
   12587          967 :     case V8BF_FTYPE_V4SF_UQI:
   12588          967 :     case V16QI_FTYPE_V16QI_V8HF:
   12589          967 :       nargs = 2;
   12590          967 :       break;
   12591          811 :     case V2DI_FTYPE_V2DI_INT_CONVERT:
   12592          811 :       nargs = 2;
   12593          811 :       rmode = V1TImode;
   12594          811 :       nargs_constant = 1;
   12595          811 :       break;
   12596           42 :     case V4DI_FTYPE_V4DI_INT_CONVERT:
   12597           42 :       nargs = 2;
   12598           42 :       rmode = V2TImode;
   12599           42 :       nargs_constant = 1;
   12600           42 :       break;
   12601           16 :     case V8DI_FTYPE_V8DI_INT_CONVERT:
   12602           16 :       nargs = 2;
   12603           16 :       rmode = V4TImode;
   12604           16 :       nargs_constant = 1;
   12605           16 :       break;
   12606         2422 :     case V8HI_FTYPE_V8HI_INT:
   12607         2422 :     case V8HI_FTYPE_V8SF_INT:
   12608         2422 :     case V16HI_FTYPE_V16SF_INT:
   12609         2422 :     case V8HI_FTYPE_V4SF_INT:
   12610         2422 :     case V8SF_FTYPE_V8SF_INT:
   12611         2422 :     case V4SF_FTYPE_V16SF_INT:
   12612         2422 :     case V16SF_FTYPE_V16SF_INT:
   12613         2422 :     case V4SI_FTYPE_V4SI_INT:
   12614         2422 :     case V4SI_FTYPE_V8SI_INT:
   12615         2422 :     case V4HI_FTYPE_V4HI_INT:
   12616         2422 :     case V4DF_FTYPE_V4DF_INT:
   12617         2422 :     case V4DF_FTYPE_V8DF_INT:
   12618         2422 :     case V4SF_FTYPE_V4SF_INT:
   12619         2422 :     case V4SF_FTYPE_V8SF_INT:
   12620         2422 :     case V2DI_FTYPE_V2DI_INT:
   12621         2422 :     case V2DF_FTYPE_V2DF_INT:
   12622         2422 :     case V2DF_FTYPE_V4DF_INT:
   12623         2422 :     case V16HI_FTYPE_V16HI_INT:
   12624         2422 :     case V8SI_FTYPE_V8SI_INT:
   12625         2422 :     case V16SI_FTYPE_V16SI_INT:
   12626         2422 :     case V4SI_FTYPE_V16SI_INT:
   12627         2422 :     case V4DI_FTYPE_V4DI_INT:
   12628         2422 :     case V2DI_FTYPE_V4DI_INT:
   12629         2422 :     case V4DI_FTYPE_V8DI_INT:
   12630         2422 :     case UQI_FTYPE_UQI_UQI_CONST:
   12631         2422 :     case UHI_FTYPE_UHI_UQI:
   12632         2422 :     case USI_FTYPE_USI_UQI:
   12633         2422 :     case UDI_FTYPE_UDI_UQI:
   12634         2422 :       nargs = 2;
   12635         2422 :       nargs_constant = 1;
   12636         2422 :       break;
   12637        18718 :     case V16QI_FTYPE_V16QI_V16QI_V16QI:
   12638        18718 :     case V8SF_FTYPE_V8SF_V8SF_V8SF:
   12639        18718 :     case V4DF_FTYPE_V4DF_V4DF_V4DF:
   12640        18718 :     case V4SF_FTYPE_V4SF_V4SF_V4SF:
   12641        18718 :     case V2DF_FTYPE_V2DF_V2DF_V2DF:
   12642        18718 :     case V32QI_FTYPE_V32QI_V32QI_V32QI:
   12643        18718 :     case UHI_FTYPE_V16SI_V16SI_UHI:
   12644        18718 :     case UQI_FTYPE_V8DI_V8DI_UQI:
   12645        18718 :     case V16HI_FTYPE_V16SI_V16HI_UHI:
   12646        18718 :     case V16QI_FTYPE_V16SI_V16QI_UHI:
   12647        18718 :     case V16QI_FTYPE_V8DI_V16QI_UQI:
   12648        18718 :     case V32HF_FTYPE_V32HF_V32HF_USI:
   12649        18718 :     case V16SF_FTYPE_V16SF_V16SF_UHI:
   12650        18718 :     case V16SF_FTYPE_V4SF_V16SF_UHI:
   12651        18718 :     case V16SI_FTYPE_SI_V16SI_UHI:
   12652        18718 :     case V16SI_FTYPE_V16HI_V16SI_UHI:
   12653        18718 :     case V16SI_FTYPE_V16QI_V16SI_UHI:
   12654        18718 :     case V8SF_FTYPE_V4SF_V8SF_UQI:
   12655        18718 :     case V4DF_FTYPE_V2DF_V4DF_UQI:
   12656        18718 :     case V8SI_FTYPE_V4SI_V8SI_UQI:
   12657        18718 :     case V8SI_FTYPE_SI_V8SI_UQI:
   12658        18718 :     case V4SI_FTYPE_V4SI_V4SI_UQI:
   12659        18718 :     case V4SI_FTYPE_SI_V4SI_UQI:
   12660        18718 :     case V4DI_FTYPE_V2DI_V4DI_UQI:
   12661        18718 :     case V4DI_FTYPE_DI_V4DI_UQI:
   12662        18718 :     case V2DI_FTYPE_V2DI_V2DI_UQI:
   12663        18718 :     case V2DI_FTYPE_DI_V2DI_UQI:
   12664        18718 :     case V64QI_FTYPE_V64QI_V64QI_UDI:
   12665        18718 :     case V64QI_FTYPE_V16QI_V64QI_UDI:
   12666        18718 :     case V64QI_FTYPE_QI_V64QI_UDI:
   12667        18718 :     case V32QI_FTYPE_V32QI_V32QI_USI:
   12668        18718 :     case V32QI_FTYPE_V16QI_V32QI_USI:
   12669        18718 :     case V32QI_FTYPE_QI_V32QI_USI:
   12670        18718 :     case V16QI_FTYPE_V16QI_V16QI_UHI:
   12671        18718 :     case V16QI_FTYPE_QI_V16QI_UHI:
   12672        18718 :     case V32HI_FTYPE_V8HI_V32HI_USI:
   12673        18718 :     case V32HI_FTYPE_V32BF_V32HI_USI:
   12674        18718 :     case V32HI_FTYPE_HI_V32HI_USI:
   12675        18718 :     case V16HI_FTYPE_V8HI_V16HI_UHI:
   12676        18718 :     case V16HI_FTYPE_V16BF_V16HI_UHI:
   12677        18718 :     case V16HI_FTYPE_HI_V16HI_UHI:
   12678        18718 :     case V8HI_FTYPE_V8HI_V8HI_UQI:
   12679        18718 :     case V8HI_FTYPE_V8BF_V8HI_UQI:
   12680        18718 :     case V8BF_FTYPE_V8BF_V8BF_UQI:
   12681        18718 :     case V8HI_FTYPE_HI_V8HI_UQI:
   12682        18718 :     case V16HF_FTYPE_V16HF_V16HF_UHI:
   12683        18718 :     case V8SF_FTYPE_V8HI_V8SF_UQI:
   12684        18718 :     case V4SF_FTYPE_V8HI_V4SF_UQI:
   12685        18718 :     case V8SI_FTYPE_V8HF_V8SI_UQI:
   12686        18718 :     case V8SF_FTYPE_V8HF_V8SF_UQI:
   12687        18718 :     case V8SI_FTYPE_V8SF_V8SI_UQI:
   12688        18718 :     case V4SI_FTYPE_V4SF_V4SI_UQI:
   12689        18718 :     case V4SI_FTYPE_V8HF_V4SI_UQI:
   12690        18718 :     case V4SF_FTYPE_V8HF_V4SF_UQI:
   12691        18718 :     case V4DI_FTYPE_V8HF_V4DI_UQI:
   12692        18718 :     case V4DI_FTYPE_V4SF_V4DI_UQI:
   12693        18718 :     case V2DI_FTYPE_V8HF_V2DI_UQI:
   12694        18718 :     case V2DI_FTYPE_V4SF_V2DI_UQI:
   12695        18718 :     case V8HF_FTYPE_V8HF_V8HF_UQI:
   12696        18718 :     case V8HF_FTYPE_V8HF_V8HF_V8HF:
   12697        18718 :     case V8HF_FTYPE_V8HI_V8HF_UQI:
   12698        18718 :     case V8HF_FTYPE_V8SI_V8HF_UQI:
   12699        18718 :     case V8HF_FTYPE_V8SF_V8HF_UQI:
   12700        18718 :     case V8HF_FTYPE_V4SI_V8HF_UQI:
   12701        18718 :     case V8HF_FTYPE_V4SF_V8HF_UQI:
   12702        18718 :     case V8HF_FTYPE_V4DI_V8HF_UQI:
   12703        18718 :     case V8HF_FTYPE_V4DF_V8HF_UQI:
   12704        18718 :     case V8HF_FTYPE_V2DI_V8HF_UQI:
   12705        18718 :     case V8HF_FTYPE_V2DF_V8HF_UQI:
   12706        18718 :     case V4SF_FTYPE_V4DI_V4SF_UQI:
   12707        18718 :     case V4SF_FTYPE_V2DI_V4SF_UQI:
   12708        18718 :     case V4DF_FTYPE_V4DI_V4DF_UQI:
   12709        18718 :     case V4DF_FTYPE_V8HF_V4DF_UQI:
   12710        18718 :     case V2DF_FTYPE_V8HF_V2DF_UQI:
   12711        18718 :     case V2DF_FTYPE_V2DI_V2DF_UQI:
   12712        18718 :     case V16QI_FTYPE_V8HI_V16QI_UQI:
   12713        18718 :     case V16QI_FTYPE_V16HI_V16QI_UHI:
   12714        18718 :     case V16QI_FTYPE_V4SI_V16QI_UQI:
   12715        18718 :     case V16QI_FTYPE_V8SI_V16QI_UQI:
   12716        18718 :     case V8HI_FTYPE_V8HF_V8HI_UQI:
   12717        18718 :     case V8HI_FTYPE_V4SI_V8HI_UQI:
   12718        18718 :     case V8HI_FTYPE_V8SI_V8HI_UQI:
   12719        18718 :     case V16QI_FTYPE_V2DI_V16QI_UQI:
   12720        18718 :     case V16QI_FTYPE_V4DI_V16QI_UQI:
   12721        18718 :     case V8HI_FTYPE_V2DI_V8HI_UQI:
   12722        18718 :     case V8HI_FTYPE_V4DI_V8HI_UQI:
   12723        18718 :     case V4SI_FTYPE_V2DI_V4SI_UQI:
   12724        18718 :     case V4SI_FTYPE_V4DI_V4SI_UQI:
   12725        18718 :     case V32QI_FTYPE_V32HI_V32QI_USI:
   12726        18718 :     case UHI_FTYPE_V16QI_V16QI_UHI:
   12727        18718 :     case USI_FTYPE_V32QI_V32QI_USI:
   12728        18718 :     case UDI_FTYPE_V64QI_V64QI_UDI:
   12729        18718 :     case UQI_FTYPE_V8HI_V8HI_UQI:
   12730        18718 :     case UHI_FTYPE_V16HI_V16HI_UHI:
   12731        18718 :     case USI_FTYPE_V32HI_V32HI_USI:
   12732        18718 :     case UQI_FTYPE_V4SI_V4SI_UQI:
   12733        18718 :     case UQI_FTYPE_V8SI_V8SI_UQI:
   12734        18718 :     case UQI_FTYPE_V2DI_V2DI_UQI:
   12735        18718 :     case UQI_FTYPE_V4DI_V4DI_UQI:
   12736        18718 :     case V4SF_FTYPE_V2DF_V4SF_UQI:
   12737        18718 :     case V4SF_FTYPE_V4DF_V4SF_UQI:
   12738        18718 :     case V16SI_FTYPE_V16SI_V16SI_UHI:
   12739        18718 :     case V16SI_FTYPE_V4SI_V16SI_UHI:
   12740        18718 :     case V2DI_FTYPE_V4SI_V2DI_UQI:
   12741        18718 :     case V2DI_FTYPE_V8HI_V2DI_UQI:
   12742        18718 :     case V2DI_FTYPE_V16QI_V2DI_UQI:
   12743        18718 :     case V4DI_FTYPE_V4DI_V4DI_UQI:
   12744        18718 :     case V4DI_FTYPE_V4SI_V4DI_UQI:
   12745        18718 :     case V4DI_FTYPE_V8HI_V4DI_UQI:
   12746        18718 :     case V4DI_FTYPE_V16QI_V4DI_UQI:
   12747        18718 :     case V4DI_FTYPE_V4DF_V4DI_UQI:
   12748        18718 :     case V2DI_FTYPE_V2DF_V2DI_UQI:
   12749        18718 :     case V4SI_FTYPE_V4DF_V4SI_UQI:
   12750        18718 :     case V4SI_FTYPE_V2DF_V4SI_UQI:
   12751        18718 :     case V4SI_FTYPE_V8HI_V4SI_UQI:
   12752        18718 :     case V4SI_FTYPE_V16QI_V4SI_UQI:
   12753        18718 :     case V4DI_FTYPE_V4DI_V4DI_V4DI:
   12754        18718 :     case V8DF_FTYPE_V2DF_V8DF_UQI:
   12755        18718 :     case V8DF_FTYPE_V4DF_V8DF_UQI:
   12756        18718 :     case V8DF_FTYPE_V8DF_V8DF_UQI:
   12757        18718 :     case V8SF_FTYPE_V8SF_V8SF_UQI:
   12758        18718 :     case V8SF_FTYPE_V8SI_V8SF_UQI:
   12759        18718 :     case V4DF_FTYPE_V4DF_V4DF_UQI:
   12760        18718 :     case V4SF_FTYPE_V4SF_V4SF_UQI:
   12761        18718 :     case V2DF_FTYPE_V2DF_V2DF_UQI:
   12762        18718 :     case V2DF_FTYPE_V4SF_V2DF_UQI:
   12763        18718 :     case V2DF_FTYPE_V4SI_V2DF_UQI:
   12764        18718 :     case V4SF_FTYPE_V4SI_V4SF_UQI:
   12765        18718 :     case V4DF_FTYPE_V4SF_V4DF_UQI:
   12766        18718 :     case V4DF_FTYPE_V4SI_V4DF_UQI:
   12767        18718 :     case V8SI_FTYPE_V8SI_V8SI_UQI:
   12768        18718 :     case V8SI_FTYPE_V8HI_V8SI_UQI:
   12769        18718 :     case V8SI_FTYPE_V16QI_V8SI_UQI:
   12770        18718 :     case V8DF_FTYPE_V8SI_V8DF_UQI:
   12771        18718 :     case V8DI_FTYPE_DI_V8DI_UQI:
   12772        18718 :     case V16SF_FTYPE_V8SF_V16SF_UHI:
   12773        18718 :     case V16SI_FTYPE_V8SI_V16SI_UHI:
   12774        18718 :     case V16HF_FTYPE_V16HI_V16HF_UHI:
   12775        18718 :     case V16HF_FTYPE_V16HF_V16HF_V16HF:
   12776        18718 :     case V16HI_FTYPE_V16HF_V16HI_UHI:
   12777        18718 :     case V16HI_FTYPE_V16HI_V16HI_UHI:
   12778        18718 :     case V16BF_FTYPE_V16BF_V16BF_UHI:
   12779        18718 :     case V8HI_FTYPE_V16QI_V8HI_UQI:
   12780        18718 :     case V16HI_FTYPE_V16QI_V16HI_UHI:
   12781        18718 :     case V32HI_FTYPE_V32HI_V32HI_USI:
   12782        18718 :     case V32BF_FTYPE_V32BF_V32BF_USI:
   12783        18718 :     case V32HI_FTYPE_V32QI_V32HI_USI:
   12784        18718 :     case V8DI_FTYPE_V16QI_V8DI_UQI:
   12785        18718 :     case V8DI_FTYPE_V2DI_V8DI_UQI:
   12786        18718 :     case V8DI_FTYPE_V4DI_V8DI_UQI:
   12787        18718 :     case V8DI_FTYPE_V8DI_V8DI_UQI:
   12788        18718 :     case V8DI_FTYPE_V8HI_V8DI_UQI:
   12789        18718 :     case V8DI_FTYPE_V8SI_V8DI_UQI:
   12790        18718 :     case V8HI_FTYPE_V8DI_V8HI_UQI:
   12791        18718 :     case V8SI_FTYPE_V8DI_V8SI_UQI:
   12792        18718 :     case V4SI_FTYPE_V4SI_V4SI_V4SI:
   12793        18718 :     case V4DI_FTYPE_V4DI_V4DI_V2DI:
   12794        18718 :     case V16SI_FTYPE_V16SI_V16SI_V16SI:
   12795        18718 :     case V8DI_FTYPE_V8DI_V8DI_V8DI:
   12796        18718 :     case V32HI_FTYPE_V32HI_V32HI_V32HI:
   12797        18718 :     case V2DI_FTYPE_V2DI_V2DI_V2DI:
   12798        18718 :     case V16HI_FTYPE_V16HI_V16HI_V16HI:
   12799        18718 :     case V8SI_FTYPE_V8SI_V8SI_V8SI:
   12800        18718 :     case V8HI_FTYPE_V8HI_V8HI_V8HI:
   12801        18718 :     case V32BF_FTYPE_V16SF_V16SF_USI:
   12802        18718 :     case V16BF_FTYPE_V8SF_V8SF_UHI:
   12803        18718 :     case V8BF_FTYPE_V4SF_V4SF_UQI:
   12804        18718 :     case V16BF_FTYPE_V16SF_V16BF_UHI:
   12805        18718 :     case V8BF_FTYPE_V8SF_V8BF_UQI:
   12806        18718 :     case V8BF_FTYPE_V4SF_V8BF_UQI:
   12807        18718 :     case V16SF_FTYPE_V16SF_V32BF_V32BF:
   12808        18718 :     case V8SF_FTYPE_V8SF_V16BF_V16BF:
   12809        18718 :     case V4SF_FTYPE_V4SF_V8BF_V8BF:
   12810        18718 :     case V16QI_FTYPE_V16QI_V8HF_V8HF:
   12811        18718 :     case V32QI_FTYPE_V32QI_V16HF_V16HF:
   12812        18718 :     case V64QI_FTYPE_V64QI_V32HF_V32HF:
   12813        18718 :     case V16QI_FTYPE_V8HF_V16QI_UQI:
   12814        18718 :     case V16QI_FTYPE_V16HF_V16QI_UHI:
   12815        18718 :     case V32QI_FTYPE_V32HF_V32QI_USI:
   12816        18718 :     case V8HF_FTYPE_V16QI_V8HF_UQI:
   12817        18718 :     case V16HF_FTYPE_V16QI_V16HF_UHI:
   12818        18718 :     case V32HF_FTYPE_V32QI_V32HF_USI:
   12819        18718 :     case V16SI_FTYPE_V16SF_V16SI_UHI:
   12820        18718 :     case V32HI_FTYPE_V32HF_V32HI_USI:
   12821        18718 :     case V8DI_FTYPE_V8SF_V8DI_UQI:
   12822        18718 :     case V8DI_FTYPE_V8DF_V8DI_UQI:
   12823        18718 :     case V8SI_FTYPE_V8DF_V8SI_UQI:
   12824        18718 :       nargs = 3;
   12825        18718 :       break;
   12826         1480 :     case V32QI_FTYPE_V32QI_V32QI_INT:
   12827         1480 :     case V16HI_FTYPE_V16HI_V16HI_INT:
   12828         1480 :     case V16QI_FTYPE_V16QI_V16QI_INT:
   12829         1480 :     case V4DI_FTYPE_V4DI_V4DI_INT:
   12830         1480 :     case V8HI_FTYPE_V8HI_V8HI_INT:
   12831         1480 :     case V8SI_FTYPE_V8SI_V8SI_INT:
   12832         1480 :     case V8SI_FTYPE_V8SI_V4SI_INT:
   12833         1480 :     case V8SF_FTYPE_V8SF_V8SF_INT:
   12834         1480 :     case V8SF_FTYPE_V8SF_V4SF_INT:
   12835         1480 :     case V4SI_FTYPE_V4SI_V4SI_INT:
   12836         1480 :     case V4DF_FTYPE_V4DF_V4DF_INT:
   12837         1480 :     case V16SF_FTYPE_V16SF_V16SF_INT:
   12838         1480 :     case V16SF_FTYPE_V16SF_V4SF_INT:
   12839         1480 :     case V16SI_FTYPE_V16SI_V4SI_INT:
   12840         1480 :     case V4DF_FTYPE_V4DF_V2DF_INT:
   12841         1480 :     case V4SF_FTYPE_V4SF_V4SF_INT:
   12842         1480 :     case V2DI_FTYPE_V2DI_V2DI_INT:
   12843         1480 :     case V4DI_FTYPE_V4DI_V2DI_INT:
   12844         1480 :     case V2DF_FTYPE_V2DF_V2DF_INT:
   12845         1480 :     case UQI_FTYPE_V8DI_V8UDI_INT:
   12846         1480 :     case UQI_FTYPE_V8DF_V8DF_INT:
   12847         1480 :     case UQI_FTYPE_V2DF_V2DF_INT:
   12848         1480 :     case UQI_FTYPE_V4SF_V4SF_INT:
   12849         1480 :     case UHI_FTYPE_V16SI_V16SI_INT:
   12850         1480 :     case UHI_FTYPE_V16SF_V16SF_INT:
   12851         1480 :     case V64QI_FTYPE_V64QI_V64QI_INT:
   12852         1480 :     case V32HI_FTYPE_V32HI_V32HI_INT:
   12853         1480 :     case V16SI_FTYPE_V16SI_V16SI_INT:
   12854         1480 :     case V8DI_FTYPE_V8DI_V8DI_INT:
   12855         1480 :       nargs = 3;
   12856         1480 :       nargs_constant = 1;
   12857         1480 :       break;
   12858           47 :     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
   12859           47 :       nargs = 3;
   12860           47 :       rmode = V4DImode;
   12861           47 :       nargs_constant = 1;
   12862           47 :       break;
   12863           80 :     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
   12864           80 :       nargs = 3;
   12865           80 :       rmode = V2DImode;
   12866           80 :       nargs_constant = 1;
   12867           80 :       break;
   12868           48 :     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
   12869           48 :       nargs = 3;
   12870           48 :       rmode = DImode;
   12871           48 :       nargs_constant = 1;
   12872           48 :       break;
   12873           20 :     case V2DI_FTYPE_V2DI_UINT_UINT:
   12874           20 :       nargs = 3;
   12875           20 :       nargs_constant = 2;
   12876           20 :       break;
   12877            8 :     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
   12878            8 :       nargs = 3;
   12879            8 :       rmode = V8DImode;
   12880            8 :       nargs_constant = 1;
   12881            8 :       break;
   12882           16 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
   12883           16 :       nargs = 5;
   12884           16 :       rmode = V8DImode;
   12885           16 :       mask_pos = 2;
   12886           16 :       nargs_constant = 1;
   12887           16 :       break;
   12888          320 :     case QI_FTYPE_V8DF_INT_UQI:
   12889          320 :     case QI_FTYPE_V4DF_INT_UQI:
   12890          320 :     case QI_FTYPE_V2DF_INT_UQI:
   12891          320 :     case HI_FTYPE_V16SF_INT_UHI:
   12892          320 :     case QI_FTYPE_V8SF_INT_UQI:
   12893          320 :     case QI_FTYPE_V4SF_INT_UQI:
   12894          320 :     case QI_FTYPE_V8HF_INT_UQI:
   12895          320 :     case HI_FTYPE_V16HF_INT_UHI:
   12896          320 :     case SI_FTYPE_V32HF_INT_USI:
   12897          320 :     case QI_FTYPE_V8BF_INT_UQI:
   12898          320 :     case HI_FTYPE_V16BF_INT_UHI:
   12899          320 :     case SI_FTYPE_V32BF_INT_USI:
   12900          320 :     case V4SI_FTYPE_V4SI_V4SI_UHI:
   12901          320 :     case V8SI_FTYPE_V8SI_V8SI_UHI:
   12902          320 :       nargs = 3;
   12903          320 :       mask_pos = 1;
   12904          320 :       nargs_constant = 1;
   12905          320 :       break;
   12906           17 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
   12907           17 :       nargs = 5;
   12908           17 :       rmode = V4DImode;
   12909           17 :       mask_pos = 2;
   12910           17 :       nargs_constant = 1;
   12911           17 :       break;
   12912           17 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
   12913           17 :       nargs = 5;
   12914           17 :       rmode = V2DImode;
   12915           17 :       mask_pos = 2;
   12916           17 :       nargs_constant = 1;
   12917           17 :       break;
   12918        17266 :     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
   12919        17266 :     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
   12920        17266 :     case V32BF_FTYPE_V32BF_V32BF_V32BF_USI:
   12921        17266 :     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
   12922        17266 :     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
   12923        17266 :     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
   12924        17266 :     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
   12925        17266 :     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
   12926        17266 :     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
   12927        17266 :     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
   12928        17266 :     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
   12929        17266 :     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
   12930        17266 :     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
   12931        17266 :     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
   12932        17266 :     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
   12933        17266 :     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
   12934        17266 :     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
   12935        17266 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
   12936        17266 :     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
   12937        17266 :     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
   12938        17266 :     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
   12939        17266 :     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
   12940        17266 :     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
   12941        17266 :     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
   12942        17266 :     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
   12943        17266 :     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
   12944        17266 :     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
   12945        17266 :     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
   12946        17266 :     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
   12947        17266 :     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
   12948        17266 :     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
   12949        17266 :     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
   12950        17266 :     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
   12951        17266 :     case V8BF_FTYPE_V8BF_V8BF_V8BF_UQI:
   12952        17266 :     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
   12953        17266 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
   12954        17266 :     case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
   12955        17266 :     case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
   12956        17266 :     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
   12957        17266 :     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
   12958        17266 :     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
   12959        17266 :     case V16BF_FTYPE_V16BF_V16BF_V16BF_UHI:
   12960        17266 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
   12961        17266 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
   12962        17266 :     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
   12963        17266 :     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
   12964        17266 :     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
   12965        17266 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
   12966        17266 :     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
   12967        17266 :     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
   12968        17266 :     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
   12969        17266 :     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
   12970        17266 :     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
   12971        17266 :     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
   12972        17266 :     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
   12973        17266 :     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
   12974        17266 :     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
   12975        17266 :     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
   12976        17266 :     case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
   12977        17266 :     case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
   12978        17266 :     case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
   12979        17266 :     case V32HF_FTYPE_V16SF_V16SF_V32HF_USI:
   12980        17266 :     case V16HF_FTYPE_V8SF_V8SF_V16HF_UHI:
   12981        17266 :     case V8HF_FTYPE_V4SF_V4SF_V8HF_UQI:
   12982        17266 :     case V16QI_FTYPE_V8HF_V8HF_V16QI_UHI:
   12983        17266 :     case V32QI_FTYPE_V16HF_V16HF_V32QI_USI:
   12984        17266 :     case V64QI_FTYPE_V32HF_V32HF_V64QI_UDI:
   12985        17266 :     case V16QI_FTYPE_V16QI_V8HF_V16QI_UHI:
   12986        17266 :     case V16QI_FTYPE_V32QI_V16HF_V16QI_UHI:
   12987        17266 :     case V32QI_FTYPE_V64QI_V32HF_V32QI_USI:
   12988        17266 :       nargs = 4;
   12989        17266 :       break;
   12990           11 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
   12991           11 :     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
   12992           11 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
   12993           11 :     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
   12994           11 :     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
   12995           11 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
   12996           11 :       nargs = 4;
   12997           11 :       nargs_constant = 1;
   12998           11 :       break;
   12999         3718 :     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
   13000         3718 :     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
   13001         3718 :     case QI_FTYPE_V4DF_V4DF_INT_UQI:
   13002         3718 :     case QI_FTYPE_V8SF_V8SF_INT_UQI:
   13003         3718 :     case UHI_FTYPE_V16HF_V16HF_INT_UHI:
   13004         3718 :     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
   13005         3718 :     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
   13006         3718 :     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
   13007         3718 :     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
   13008         3718 :     case UQI_FTYPE_V8HF_V8HF_INT_UQI:
   13009         3718 :     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
   13010         3718 :     case USI_FTYPE_V32QI_V32QI_INT_USI:
   13011         3718 :     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
   13012         3718 :     case USI_FTYPE_V32HI_V32HI_INT_USI:
   13013         3718 :     case USI_FTYPE_V32BF_V32BF_INT_USI:
   13014         3718 :     case USI_FTYPE_V32HF_V32HF_INT_USI:
   13015         3718 :     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
   13016         3718 :     case UHI_FTYPE_V16BF_V16BF_INT_UHI:
   13017         3718 :     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
   13018         3718 :     case UQI_FTYPE_V8BF_V8BF_INT_UQI:
   13019         3718 :       nargs = 4;
   13020         3718 :       mask_pos = 1;
   13021         3718 :       nargs_constant = 1;
   13022         3718 :       break;
   13023           23 :     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
   13024           23 :       nargs = 4;
   13025           23 :       nargs_constant = 2;
   13026           23 :       break;
   13027           67 :     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
   13028           67 :     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
   13029           67 :     case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
   13030           67 :     case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
   13031           67 :     case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
   13032           67 :       nargs = 4;
   13033           67 :       break;
   13034          679 :     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
   13035          679 :     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
   13036          679 :       mask_pos = 1;
   13037          679 :       nargs = 4;
   13038          679 :       nargs_constant = 1;
   13039          679 :       break;
   13040         3948 :     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
   13041         3948 :     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
   13042         3948 :     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
   13043         3948 :     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
   13044         3948 :     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
   13045         3948 :     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
   13046         3948 :     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
   13047         3948 :     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
   13048         3948 :     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
   13049         3948 :     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
   13050         3948 :     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
   13051         3948 :     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
   13052         3948 :     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
   13053         3948 :     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
   13054         3948 :     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
   13055         3948 :     case V32BF_FTYPE_V32BF_INT_V32BF_USI:
   13056         3948 :     case V16BF_FTYPE_V16BF_INT_V16BF_UHI:
   13057         3948 :     case V8BF_FTYPE_V8BF_INT_V8BF_UQI:
   13058         3948 :     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
   13059         3948 :     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
   13060         3948 :     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
   13061         3948 :     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
   13062         3948 :     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
   13063         3948 :     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
   13064         3948 :     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
   13065         3948 :     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
   13066         3948 :     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
   13067         3948 :     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
   13068         3948 :     case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
   13069         3948 :     case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
   13070         3948 :     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
   13071         3948 :     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
   13072         3948 :     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
   13073         3948 :     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
   13074         3948 :     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
   13075         3948 :       nargs = 4;
   13076         3948 :       mask_pos = 2;
   13077         3948 :       nargs_constant = 1;
   13078         3948 :       break;
   13079         1726 :     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
   13080         1726 :     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
   13081         1726 :     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
   13082         1726 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
   13083         1726 :     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
   13084         1726 :     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
   13085         1726 :     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
   13086         1726 :     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
   13087         1726 :     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
   13088         1726 :     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
   13089         1726 :     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
   13090         1726 :     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
   13091         1726 :     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
   13092         1726 :     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
   13093         1726 :     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
   13094         1726 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
   13095         1726 :     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
   13096         1726 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
   13097         1726 :     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
   13098         1726 :     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
   13099         1726 :     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
   13100         1726 :     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
   13101         1726 :     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
   13102         1726 :     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
   13103         1726 :     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
   13104         1726 :     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
   13105         1726 :     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
   13106         1726 :       nargs = 5;
   13107         1726 :       mask_pos = 2;
   13108         1726 :       nargs_constant = 1;
   13109         1726 :       break;
   13110          268 :     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
   13111          268 :     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
   13112          268 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
   13113          268 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
   13114          268 :     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
   13115          268 :     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
   13116          268 :     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
   13117          268 :     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
   13118          268 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
   13119          268 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
   13120          268 :       nargs = 5;
   13121          268 :       mask_pos = 1;
   13122          268 :       nargs_constant = 1;
   13123          268 :       break;
   13124          732 :     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
   13125          732 :     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
   13126          732 :     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
   13127          732 :     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
   13128          732 :     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
   13129          732 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
   13130          732 :     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
   13131          732 :     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
   13132          732 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
   13133          732 :     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
   13134          732 :     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
   13135          732 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
   13136          732 :     case V8BF_FTYPE_V8BF_V8BF_INT_V8BF_UQI:
   13137          732 :     case V16BF_FTYPE_V16BF_V16BF_INT_V16BF_UHI:
   13138          732 :     case V32BF_FTYPE_V32BF_V32BF_INT_V32BF_USI:
   13139          732 :     case V16HF_FTYPE_V16HF_V16HF_INT_V16HF_UHI:
   13140          732 :     case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI:
   13141          732 :       nargs = 5;
   13142          732 :       mask_pos = 1;
   13143          732 :       nargs_constant = 2;
   13144          732 :       break;
   13145              : 
   13146            0 :     default:
   13147            0 :       gcc_unreachable ();
   13148              :     }
   13149              : 
   13150        56356 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   13151              : 
   13152        61682 :   if (comparison != UNKNOWN)
   13153              :     {
   13154          614 :       gcc_assert (nargs == 2);
   13155          614 :       return ix86_expand_sse_compare (d, exp, target, swap);
   13156              :     }
   13157              : 
   13158        61068 :   if (rmode == VOIDmode || rmode == tmode)
   13159              :     {
   13160        60883 :       if (optimize
   13161        17729 :           || target == 0
   13162        17729 :           || GET_MODE (target) != tmode
   13163        78410 :           || !insn_p->operand[0].predicate (target, tmode))
   13164        43444 :         target = gen_reg_rtx (tmode);
   13165        17439 :       else if (memory_operand (target, tmode))
   13166          578 :         num_memory++;
   13167              :       real_target = target;
   13168              :     }
   13169              :   else
   13170              :     {
   13171          185 :       real_target = gen_reg_rtx (tmode);
   13172          185 :       target = lowpart_subreg (rmode, real_target, tmode);
   13173              :     }
   13174              : 
   13175       261380 :   for (i = 0; i < nargs; i++)
   13176              :     {
   13177       200545 :       tree arg = CALL_EXPR_ARG (exp, i);
   13178       200545 :       rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
   13179       200545 :       machine_mode mode = insn_p->operand[i + 1].mode;
   13180              :       /* Need to fixup modeless constant before testing predicate.  */
   13181       200545 :       op = fixup_modeless_constant (op, mode);
   13182       200545 :       bool match = insn_p->operand[i + 1].predicate (op, mode);
   13183              : 
   13184       200545 :       if (second_arg_count && i == 1)
   13185              :         {
   13186              :           /* SIMD shift insns take either an 8-bit immediate or
   13187              :              register as count.  But builtin functions take int as
   13188              :              count.  If count doesn't match, we put it in register.
   13189              :              The instructions are using 64-bit count, if op is just
   13190              :              32-bit, zero-extend it, as negative shift counts
   13191              :              are undefined behavior and zero-extension is more
   13192              :              efficient.  */
   13193         2889 :           if (!match)
   13194              :             {
   13195         1750 :               if (SCALAR_INT_MODE_P (GET_MODE (op)))
   13196          489 :                 op = convert_modes (mode, GET_MODE (op), op, 1);
   13197              :               else
   13198         1261 :                 op = lowpart_subreg (mode, op, GET_MODE (op));
   13199         1750 :               if (!insn_p->operand[i + 1].predicate (op, mode))
   13200          190 :                 op = copy_to_reg (op);
   13201              :             }
   13202              :         }
   13203       197656 :       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   13204       149600 :                (!mask_pos && (nargs - i) <= nargs_constant))
   13205              :         {
   13206        16488 :           if (!match)
   13207          233 :             switch (icode)
   13208              :               {
   13209            2 :               case CODE_FOR_avx_vinsertf128v4di:
   13210            2 :               case CODE_FOR_avx_vextractf128v4di:
   13211            2 :                 error ("the last argument must be an 1-bit immediate");
   13212            2 :                 return const0_rtx;
   13213              : 
   13214            8 :               case CODE_FOR_avx512f_cmpv8di3_mask:
   13215            8 :               case CODE_FOR_avx512f_cmpv16si3_mask:
   13216            8 :               case CODE_FOR_avx512f_ucmpv8di3_mask:
   13217            8 :               case CODE_FOR_avx512f_ucmpv16si3_mask:
   13218            8 :               case CODE_FOR_avx512vl_cmpv4di3_mask:
   13219            8 :               case CODE_FOR_avx512vl_cmpv8si3_mask:
   13220            8 :               case CODE_FOR_avx512vl_ucmpv4di3_mask:
   13221            8 :               case CODE_FOR_avx512vl_ucmpv8si3_mask:
   13222            8 :               case CODE_FOR_avx512vl_cmpv2di3_mask:
   13223            8 :               case CODE_FOR_avx512vl_cmpv4si3_mask:
   13224            8 :               case CODE_FOR_avx512vl_ucmpv2di3_mask:
   13225            8 :               case CODE_FOR_avx512vl_ucmpv4si3_mask:
   13226            8 :                 error ("the last argument must be a 3-bit immediate");
   13227            8 :                 return const0_rtx;
   13228              : 
   13229           24 :               case CODE_FOR_sse4_1_roundsd:
   13230           24 :               case CODE_FOR_sse4_1_roundss:
   13231              : 
   13232           24 :               case CODE_FOR_sse4_1_roundpd:
   13233           24 :               case CODE_FOR_sse4_1_roundps:
   13234           24 :               case CODE_FOR_avx_roundpd256:
   13235           24 :               case CODE_FOR_avx_roundps256:
   13236              : 
   13237           24 :               case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
   13238           24 :               case CODE_FOR_sse4_1_roundps_sfix:
   13239           24 :               case CODE_FOR_avx_roundpd_vec_pack_sfix256:
   13240           24 :               case CODE_FOR_avx_roundps_sfix256:
   13241              : 
   13242           24 :               case CODE_FOR_sse4_1_blendps:
   13243           24 :               case CODE_FOR_avx_blendpd256:
   13244           24 :               case CODE_FOR_avx_vpermilv4df:
   13245           24 :               case CODE_FOR_avx_vpermilv4df_mask:
   13246           24 :               case CODE_FOR_avx512f_getmantv8df_mask:
   13247           24 :               case CODE_FOR_avx512f_getmantv16sf_mask:
   13248           24 :               case CODE_FOR_avx512vl_getmantv16hf_mask:
   13249           24 :               case CODE_FOR_avx512vl_getmantv8sf_mask:
   13250           24 :               case CODE_FOR_avx512vl_getmantv4df_mask:
   13251           24 :               case CODE_FOR_avx512fp16_getmantv8hf_mask:
   13252           24 :               case CODE_FOR_avx512vl_getmantv4sf_mask:
   13253           24 :               case CODE_FOR_avx512vl_getmantv2df_mask:
   13254           24 :               case CODE_FOR_avx512dq_rangepv8df_mask_round:
   13255           24 :               case CODE_FOR_avx512dq_rangepv16sf_mask_round:
   13256           24 :               case CODE_FOR_avx512dq_rangepv4df_mask:
   13257           24 :               case CODE_FOR_avx512dq_rangepv8sf_mask:
   13258           24 :               case CODE_FOR_avx512dq_rangepv2df_mask:
   13259           24 :               case CODE_FOR_avx512dq_rangepv4sf_mask:
   13260           24 :               case CODE_FOR_avx_shufpd256_mask:
   13261           24 :                 error ("the last argument must be a 4-bit immediate");
   13262           24 :                 return const0_rtx;
   13263              : 
   13264           15 :               case CODE_FOR_sha1rnds4:
   13265           15 :               case CODE_FOR_sse4_1_blendpd:
   13266           15 :               case CODE_FOR_avx_vpermilv2df:
   13267           15 :               case CODE_FOR_avx_vpermilv2df_mask:
   13268           15 :               case CODE_FOR_xop_vpermil2v2df3:
   13269           15 :               case CODE_FOR_xop_vpermil2v4sf3:
   13270           15 :               case CODE_FOR_xop_vpermil2v4df3:
   13271           15 :               case CODE_FOR_xop_vpermil2v8sf3:
   13272           15 :               case CODE_FOR_avx512f_vinsertf32x4_mask:
   13273           15 :               case CODE_FOR_avx512f_vinserti32x4_mask:
   13274           15 :               case CODE_FOR_avx512f_vextractf32x4_mask:
   13275           15 :               case CODE_FOR_avx512f_vextracti32x4_mask:
   13276           15 :               case CODE_FOR_sse2_shufpd:
   13277           15 :               case CODE_FOR_sse2_shufpd_mask:
   13278           15 :               case CODE_FOR_avx512dq_shuf_f64x2_mask:
   13279           15 :               case CODE_FOR_avx512dq_shuf_i64x2_mask:
   13280           15 :               case CODE_FOR_avx512vl_shuf_i32x4_mask:
   13281           15 :               case CODE_FOR_avx512vl_shuf_f32x4_mask:
   13282           15 :                 error ("the last argument must be a 2-bit immediate");
   13283           15 :                 return const0_rtx;
   13284              : 
   13285           30 :               case CODE_FOR_avx_vextractf128v4df:
   13286           30 :               case CODE_FOR_avx_vextractf128v8sf:
   13287           30 :               case CODE_FOR_avx_vextractf128v8si:
   13288           30 :               case CODE_FOR_avx_vinsertf128v4df:
   13289           30 :               case CODE_FOR_avx_vinsertf128v8sf:
   13290           30 :               case CODE_FOR_avx_vinsertf128v8si:
   13291           30 :               case CODE_FOR_avx512f_vinsertf64x4_mask:
   13292           30 :               case CODE_FOR_avx512f_vinserti64x4_mask:
   13293           30 :               case CODE_FOR_avx512f_vextractf64x4_mask:
   13294           30 :               case CODE_FOR_avx512f_vextracti64x4_mask:
   13295           30 :               case CODE_FOR_avx512dq_vinsertf32x8_mask:
   13296           30 :               case CODE_FOR_avx512dq_vinserti32x8_mask:
   13297           30 :               case CODE_FOR_avx512vl_vinsertv4df:
   13298           30 :               case CODE_FOR_avx512vl_vinsertv4di:
   13299           30 :               case CODE_FOR_avx512vl_vinsertv8sf:
   13300           30 :               case CODE_FOR_avx512vl_vinsertv8si:
   13301           30 :                 error ("the last argument must be a 1-bit immediate");
   13302           30 :                 return const0_rtx;
   13303              : 
   13304           16 :               case CODE_FOR_avx_vmcmpv2df3:
   13305           16 :               case CODE_FOR_avx_vmcmpv4sf3:
   13306           16 :               case CODE_FOR_avx_cmpv2df3:
   13307           16 :               case CODE_FOR_avx_cmpv4sf3:
   13308           16 :                 if (CONST_INT_P (op) && IN_RANGE (INTVAL (op), 8, 31))
   13309              :                   {
   13310            4 :                     error ("'%s' needs isa option %s", d->name, "-mavx");
   13311            4 :                     return const0_rtx;
   13312              :                   }
   13313              :                 /* FALLTHRU */
   13314           18 :               case CODE_FOR_avx_cmpv4df3:
   13315           18 :               case CODE_FOR_avx_cmpv8sf3:
   13316           18 :               case CODE_FOR_avx512f_cmpv8df3_mask:
   13317           18 :               case CODE_FOR_avx512f_cmpv16sf3_mask:
   13318           18 :               case CODE_FOR_avx512f_vmcmpv2df3_mask:
   13319           18 :               case CODE_FOR_avx512f_vmcmpv4sf3_mask:
   13320           18 :               case CODE_FOR_avx512bw_cmpv32hf3_mask:
   13321           18 :               case CODE_FOR_avx512vl_cmpv16hf3_mask:
   13322           18 :               case CODE_FOR_avx512fp16_cmpv8hf3_mask:
   13323           18 :                 error ("the last argument must be a 5-bit immediate");
   13324           18 :                 return const0_rtx;
   13325              : 
   13326          132 :               default:
   13327          132 :                 switch (nargs_constant)
   13328              :                   {
   13329            8 :                   case 2:
   13330            8 :                     if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   13331            8 :                         (!mask_pos && (nargs - i) == nargs_constant))
   13332              :                       {
   13333            4 :                         error ("the next to last argument must be an 8-bit immediate");
   13334            4 :                         break;
   13335              :                       }
   13336              :                     /* FALLTHRU */
   13337          128 :                   case 1:
   13338          128 :                     error ("the last argument must be an 8-bit immediate");
   13339          128 :                     break;
   13340            0 :                   default:
   13341            0 :                     gcc_unreachable ();
   13342              :                   }
   13343          132 :                 return const0_rtx;
   13344              :               }
   13345              :         }
   13346              :       else
   13347              :         {
   13348       181168 :           if (VECTOR_MODE_P (mode))
   13349       130544 :             op = safe_vector_operand (op, mode);
   13350              : 
   13351              :           /* If we aren't optimizing, only allow one memory operand to
   13352              :              be generated.  */
   13353       181168 :           if (memory_operand (op, mode))
   13354              :             {
   13355        29875 :               num_memory++;
   13356        29875 :               if (!optimize && num_memory > 1)
   13357        13613 :                 op = copy_to_mode_reg (mode, op);
   13358              :             }
   13359              : 
   13360       181168 :           if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   13361              :             {
   13362       178853 :               if (!match)
   13363        42591 :                 op = copy_to_mode_reg (mode, op);
   13364              :             }
   13365              :           else
   13366              :             {
   13367         2315 :               op = copy_to_reg (op);
   13368         2315 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   13369              :             }
   13370              :         }
   13371              : 
   13372       200312 :       xops[i] = op;
   13373              :     }
   13374              : 
   13375        60835 :   switch (nargs)
   13376              :     {
   13377         4712 :     case 1:
   13378         4712 :       pat = GEN_FCN (icode) (real_target, xops[0]);
   13379         4712 :       break;
   13380         5686 :     case 2:
   13381         5686 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
   13382         5686 :       break;
   13383        20631 :     case 3:
   13384        20631 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
   13385        20631 :       break;
   13386        27066 :     case 4:
   13387        27066 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13388        27066 :                              xops[2], xops[3]);
   13389        27066 :       break;
   13390         2740 :     case 5:
   13391         2740 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13392         2740 :                              xops[2], xops[3], xops[4]);
   13393         2740 :       break;
   13394              :     case 6:
   13395              :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13396              :                              xops[2], xops[3], xops[4], xops[5]);
   13397              :       break;
   13398              :     default:
   13399              :       gcc_unreachable ();
   13400              :     }
   13401              : 
   13402        60835 :   if (! pat)
   13403              :     return 0;
   13404              : 
   13405        60835 :   emit_insn (pat);
   13406        60835 :   return target;
   13407              : }
   13408              : 
   13409              : /* Transform pattern of following layout:
   13410              :      (set A
   13411              :        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   13412              :      )
   13413              :    into:
   13414              :      (set (A B)) */
   13415              : 
   13416              : static rtx
   13417         4949 : ix86_erase_embedded_rounding (rtx pat)
   13418              : {
   13419         4949 :   if (NONJUMP_INSN_P (pat))
   13420          699 :     pat = PATTERN (pat);
   13421              : 
   13422         4949 :   gcc_assert (GET_CODE (pat) == SET);
   13423         4949 :   rtx src = SET_SRC (pat);
   13424         4949 :   gcc_assert (XVECLEN (src, 0) == 2);
   13425         4949 :   rtx p0 = XVECEXP (src, 0, 0);
   13426         4949 :   gcc_assert (GET_CODE (src) == UNSPEC
   13427              :               && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
   13428         4949 :   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
   13429         4949 :   return res;
   13430              : }
   13431              : 
   13432              : /* Subroutine of ix86_expand_round_builtin to take care of comi insns
   13433              :    with rounding.  */
   13434              : static rtx
   13435          103 : ix86_expand_sse_comi_round (const struct builtin_description *d,
   13436              :                             tree exp, rtx target, bool comx_ok)
   13437              : {
   13438          103 :   rtx pat, set_dst;
   13439          103 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   13440          103 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   13441          103 :   tree arg2 = CALL_EXPR_ARG (exp, 2);
   13442          103 :   tree arg3 = CALL_EXPR_ARG (exp, 3);
   13443          103 :   rtx op0 = expand_normal (arg0);
   13444          103 :   rtx op1 = expand_normal (arg1);
   13445          103 :   rtx op2 = expand_normal (arg2);
   13446          103 :   rtx op3 = expand_normal (arg3);
   13447          103 :   enum insn_code icode = d->icode;
   13448          103 :   const struct insn_data_d *insn_p = &insn_data[icode];
   13449          103 :   machine_mode mode0 = insn_p->operand[0].mode;
   13450          103 :   machine_mode mode1 = insn_p->operand[1].mode;
   13451              : 
   13452              :   /* See avxintrin.h for values.  */
   13453          103 :   static const enum rtx_code comparisons[32] =
   13454              :     {
   13455              :       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
   13456              :       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
   13457              :       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
   13458              :       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
   13459              :     };
   13460          103 :   static const bool ordereds[32] =
   13461              :     {
   13462              :       true,  true,  true,  false, false, false, false, true,
   13463              :       false, false, false, true,  true,  true,  true,  false,
   13464              :       true,  true,  true,  false, false, false, false, true,
   13465              :       false, false, false, true,  true,  true,  true,  false
   13466              :     };
   13467          103 :   static const bool non_signalings[32] =
   13468              :     {
   13469              :       true,  false, false, true,  true,  false, false, true,
   13470              :       true,  false, false, true,  true,  false, false, true,
   13471              :       false, true,  true,  false, false, true,  true,  false,
   13472              :       false, true,  true,  false, false, true,  true,  false
   13473              :     };
   13474              : 
   13475          103 :   if (!CONST_INT_P (op2))
   13476              :     {
   13477            0 :       error ("the third argument must be comparison constant");
   13478            0 :       return const0_rtx;
   13479              :     }
   13480          103 :   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
   13481              :     {
   13482            0 :       error ("incorrect comparison mode");
   13483            0 :       return const0_rtx;
   13484              :     }
   13485              : 
   13486          103 :   if (!insn_p->operand[2].predicate (op3, SImode))
   13487              :     {
   13488            4 :       error ("incorrect rounding operand");
   13489            4 :       return const0_rtx;
   13490              :     }
   13491              : 
   13492           99 :   if (VECTOR_MODE_P (mode0))
   13493           99 :     op0 = safe_vector_operand (op0, mode0);
   13494           99 :   if (VECTOR_MODE_P (mode1))
   13495           99 :     op1 = safe_vector_operand (op1, mode1);
   13496              : 
   13497           99 :   enum rtx_code comparison = comparisons[INTVAL (op2)];
   13498           99 :   enum rtx_code orig_comp = comparison;
   13499           99 :   bool ordered = ordereds[INTVAL (op2)];
   13500           99 :   bool non_signaling = non_signalings[INTVAL (op2)];
   13501           99 :   rtx const_val = const0_rtx;
   13502              : 
   13503           99 :   bool check_unordered = false;
   13504           99 :   machine_mode mode = CCFPmode;
   13505           99 :   switch (comparison)
   13506              :     {
   13507            8 :     case ORDERED:
   13508            8 :       if (!ordered)
   13509              :         {
   13510            4 :           if (TARGET_AVX10_2 && comx_ok)
   13511              :             {
   13512              :               /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
   13513              :                  differently. So directly return true here.  */
   13514            0 :               target = gen_reg_rtx (SImode);
   13515            0 :               emit_move_insn (target, const1_rtx);
   13516            0 :               return target;
   13517              :             }
   13518              :           else
   13519              :             {
   13520              :               /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
   13521              :               if (!non_signaling)
   13522           99 :                 ordered = true;
   13523           99 :               mode = CCSmode;
   13524              :             }
   13525              :         }
   13526              :       else
   13527              :         {
   13528              :           /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
   13529              :           if (non_signaling)
   13530              :             ordered = false;
   13531              :           mode = CCPmode;
   13532              :         }
   13533              :       comparison = NE;
   13534              :       break;
   13535            8 :     case UNORDERED:
   13536            8 :       if (ordered)
   13537              :         {
   13538            4 :           if (TARGET_AVX10_2 && comx_ok)
   13539              :             {
   13540              :               /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
   13541              :                  differently. So directly return false here.  */
   13542            0 :               target = gen_reg_rtx (SImode);
   13543            0 :               emit_move_insn (target, const0_rtx);
   13544            0 :               return target;
   13545              :             }
   13546              :           else
   13547              :             {
   13548              :               /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
   13549              :               if (non_signaling)
   13550           99 :                 ordered = false;
   13551              :               mode = CCSmode;
   13552              :             }
   13553              :         }
   13554              :       else
   13555              :         {
   13556              :           /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
   13557              :           if (!non_signaling)
   13558           99 :             ordered = true;
   13559           99 :           mode = CCPmode;
   13560              :         }
   13561              :       comparison = EQ;
   13562              :       break;
   13563              : 
   13564           40 :     case LE:    /* -> GE  */
   13565           40 :     case LT:    /* -> GT  */
   13566           40 :     case UNGE:  /* -> UNLE  */
   13567           40 :     case UNGT:  /* -> UNLT  */
   13568           40 :       std::swap (op0, op1);
   13569           40 :       comparison = swap_condition (comparison);
   13570              :       /* FALLTHRU */
   13571           68 :     case GT:
   13572           68 :     case GE:
   13573           68 :     case UNEQ:
   13574           68 :     case UNLT:
   13575           68 :     case UNLE:
   13576           68 :     case LTGT:
   13577              :       /* These are supported by CCFPmode.  NB: Use ordered/signaling
   13578              :          COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
   13579              :          with NAN operands.  */
   13580           68 :       if (ordered == non_signaling)
   13581              :         ordered = !ordered;
   13582              :       break;
   13583              :       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
   13584              :          _CMP_EQ_OQ/_CMP_EQ_OS.
   13585              :          Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
   13586              :          of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
   13587            8 :     case EQ:
   13588            8 :       if (!TARGET_AVX10_2 || !comx_ok)
   13589            5 :         check_unordered = true;
   13590              :       mode = CCZmode;
   13591              :       break;
   13592            7 :     case NE:
   13593              :       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
   13594              :          _CMP_NEQ_UQ/_CMP_NEQ_US.
   13595              :          Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
   13596              :          of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
   13597            7 :       gcc_assert (!ordered);
   13598            7 :       if (!TARGET_AVX10_2 || !comx_ok)
   13599            4 :         check_unordered = true;
   13600            7 :       mode = CCZmode;
   13601            7 :       const_val = const1_rtx;
   13602            7 :       break;
   13603            0 :     default:
   13604            0 :       gcc_unreachable ();
   13605              :     }
   13606              : 
   13607           99 :   target = gen_reg_rtx (SImode);
   13608           99 :   emit_move_insn (target, const_val);
   13609           99 :   target = gen_rtx_SUBREG (QImode, target, 0);
   13610              : 
   13611           93 :   if ((optimize && !register_operand (op0, mode0))
   13612          192 :       || !insn_p->operand[0].predicate (op0, mode0))
   13613            6 :     op0 = copy_to_mode_reg (mode0, op0);
   13614           93 :   if ((optimize && !register_operand (op1, mode1))
   13615          192 :       || !insn_p->operand[1].predicate (op1, mode1))
   13616            6 :     op1 = copy_to_mode_reg (mode1, op1);
   13617              : 
   13618              :     /* Generate comx instead of comi when EQ/NE to avoid NAN checks.
   13619              :        Use orig_comp to exclude ORDERED/UNORDERED cases.  */
   13620           99 :   if ((orig_comp == EQ || orig_comp == NE)
   13621           15 :       && TARGET_AVX10_2 && comx_ok)
   13622              :     {
   13623            6 :       switch (icode)
   13624              :         {
   13625              :         case CODE_FOR_avx512fp16_comi_round:
   13626           99 :           icode = CODE_FOR_avx10_2_comxhf_round;
   13627              :           break;
   13628            4 :         case CODE_FOR_sse_comi_round:
   13629            4 :           icode = CODE_FOR_avx10_2_comxsf_round;
   13630            4 :           break;
   13631            2 :         case CODE_FOR_sse2_comi_round:
   13632            2 :           icode = CODE_FOR_avx10_2_comxdf_round;
   13633            2 :           break;
   13634              : 
   13635              :         default:
   13636              :           break;
   13637              :         }
   13638              :     }
   13639              : 
   13640              :   /* Generate comi instead of comx when UNEQ/LTGT to avoid NAN checks.  */
   13641           99 :   if ((comparison == UNEQ || comparison == LTGT)
   13642            8 :        && TARGET_AVX10_2 && comx_ok)
   13643              :     {
   13644            0 :       switch (icode)
   13645              :         {
   13646              :         case CODE_FOR_avx10_2_comxhf_round:
   13647           99 :           icode = CODE_FOR_avx512fp16_comi_round;
   13648              :           break;
   13649            0 :         case CODE_FOR_avx10_2_comxsf_round:
   13650            0 :           icode = CODE_FOR_sse_comi_round;
   13651            0 :           break;
   13652            0 :         case CODE_FOR_avx10_2_comxdf_round:
   13653            0 :           icode = CODE_FOR_sse2_comi_round;
   13654            0 :           break;
   13655              : 
   13656              :         default:
   13657              :           break;
   13658              :         }
   13659              :     }
   13660              : 
   13661              :   /*
   13662              :      1. COMI/VCOMX: ordered and signaling.
   13663              :      2. UCOMI/VUCOMX: unordered and non-signaling.
   13664              :    */
   13665           99 :   if (non_signaling)
   13666           38 :     switch (icode)
   13667              :       {
   13668              :       case CODE_FOR_sse_comi_round:
   13669              :         icode = CODE_FOR_sse_ucomi_round;
   13670              :         break;
   13671           17 :       case CODE_FOR_sse2_comi_round:
   13672           17 :         icode = CODE_FOR_sse2_ucomi_round;
   13673           17 :         break;
   13674            0 :       case CODE_FOR_avx512fp16_comi_round:
   13675            0 :         icode = CODE_FOR_avx512fp16_ucomi_round;
   13676            0 :         break;
   13677            3 :       case CODE_FOR_avx10_2_comxsf_round:
   13678            3 :         icode = CODE_FOR_avx10_2_ucomxsf_round;
   13679            3 :         break;
   13680            0 :       case CODE_FOR_avx10_2_comxhf_round:
   13681            0 :         icode = CODE_FOR_avx10_2_ucomxhf_round;
   13682            0 :         break;
   13683            1 :       case CODE_FOR_avx10_2_comxdf_round:
   13684            1 :         icode = CODE_FOR_avx10_2_ucomxdf_round;
   13685            1 :         break;
   13686            0 :       default:
   13687            0 :         gcc_unreachable ();
   13688              :       }
   13689              : 
   13690           99 :   pat = GEN_FCN (icode) (op0, op1, op3);
   13691           99 :   if (! pat)
   13692              :     return 0;
   13693              : 
   13694              :   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
   13695           99 :   if (INTVAL (op3) == NO_ROUND)
   13696              :     {
   13697            1 :       pat = ix86_erase_embedded_rounding (pat);
   13698            1 :       if (! pat)
   13699              :         return 0;
   13700              : 
   13701            1 :       set_dst = SET_DEST (pat);
   13702              :     }
   13703              :   else
   13704              :     {
   13705           98 :       gcc_assert (GET_CODE (pat) == SET);
   13706           98 :       set_dst = SET_DEST (pat);
   13707              :     }
   13708              : 
   13709           99 :   emit_insn (pat);
   13710              : 
   13711           99 :   return ix86_ssecom_setcc (comparison, check_unordered, mode,
   13712           99 :                             set_dst, target);
   13713              : }
   13714              : 
/* Subroutine of ix86_expand_builtin to take care of insns with an
   embedded rounding-mode operand.  D describes the builtin; EXP is the
   CALL_EXPR being expanded; TARGET is a suggested destination rtx (may
   be reused if its mode and predicate match).  Returns the result rtx,
   const0_rtx after emitting an error for an invalid immediate/rounding
   operand, or 0 if pattern generation failed.  */
static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
                           tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  /* Classify the builtin's function type: set NARGS (total call
     arguments, including the trailing rounding immediate) and
     NARGS_CONSTANT (position from the end of the argument that must be
     a compile-time immediate, validated below).  The COMI round
     comparisons are handed off to ix86_expand_sse_comi_round.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT64_FTYPE_V8HF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case UINT_FTYPE_V8HF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT64_FTYPE_V8HF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
    case INT_FTYPE_V8HF_INT:
      nargs = 2;
      break;
    case V32HF_FTYPE_V32HF_V32HF_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT:
    case V8HF_FTYPE_V8HF_INT_INT:
    case V8HF_FTYPE_V8HF_UINT_INT:
    case V8HF_FTYPE_V8HF_INT64_INT:
    case V8HF_FTYPE_V8HF_UINT64_INT:
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V32HI_FTYPE_V32HF_V32HI_USI_INT:
    case V32HI_FTYPE_V32BF_V32HI_USI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
    case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
    case V32HF_FTYPE_V32HI_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_UHI_INT:
    case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
    case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
    case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
    case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
    case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
    case V16HI_FTYPE_V16BF_V16HI_UHI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      /* COMI/UCOMI-style comparisons need CC-flag handling; delegate.  */
      return ix86_expand_sse_comi_round (d, exp, target, true);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
    case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
    case V32HF_FTYPE_V16SF_V16SF_V32HF_USI_INT:
      nargs = 5;
      break;
    case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
    case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI_INT:
    case V32HF_FTYPE_V32HF_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  /* Reuse TARGET only when it already has the pattern's output mode and
     satisfies the output operand's predicate; otherwise grab a fresh
     pseudo.  */
  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* Expand each call argument into XOPS[i], validating the immediate
     and rounding operands along the way.  */
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      /* Argument NARGS - NARGS_CONSTANT must be a compile-time
	 immediate; pick the diagnostic matching the insn's immediate
	 width.  */
      if (i == nargs - nargs_constant)
        {
          if (!match)
            {
              switch (icode)
                {
                case CODE_FOR_avx512f_getmantv8df_mask_round:
                case CODE_FOR_avx512f_getmantv16sf_mask_round:
                case CODE_FOR_avx512bw_getmantv32hf_mask_round:
                case CODE_FOR_avx512f_vgetmantv2df_round:
                case CODE_FOR_avx512f_vgetmantv2df_mask_round:
                case CODE_FOR_avx512f_vgetmantv4sf_round:
                case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
                case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
                  error ("the immediate argument must be a 4-bit immediate");
                  return const0_rtx;
                case CODE_FOR_avx512f_cmpv8df3_mask_round:
                case CODE_FOR_avx512f_cmpv16sf3_mask_round:
                case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
                case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
                case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
                case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
                  error ("the immediate argument must be a 5-bit immediate");
                  return const0_rtx;
                default:
                  error ("the immediate argument must be an 8-bit immediate");
                  return const0_rtx;
                }
            }
        }
      /* The last argument is always the rounding-mode immediate.  */
      else if (i == nargs-1)
        {
          if (!insn_p->operand[nargs].predicate (op, SImode))
            {
              error ("incorrect rounding operand");
              return const0_rtx;
            }

          /* If there is no rounding use normal version of the pattern.  */
          if (INTVAL (op) == NO_ROUND)
            {
              /* Skip erasing embedded rounding for below expanders who
                 generates multiple insns.  In ix86_erase_embedded_rounding
                 the pattern will be transformed to a single set, and emit_insn
                 appends the set instead of insert it to chain.  So the insns
                 emitted inside define_expander would be ignored.  */
              switch (icode)
                {
                case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
                case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
                case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
                case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
                case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
                case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
                  redundant_embed_rnd = 0;
                  break;
                default:
                  redundant_embed_rnd = 1;
                  break;
                }
            }
        }
      /* Ordinary data operand: coerce into the form the insn's operand
	 predicate accepts, copying to a register when needed.  */
      else
        {
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            {
              if (optimize || !match)
                op = copy_to_mode_reg (mode, op);
            }
          else
            {
              /* Mode mismatch: copy to a register and reinterpret the
		 low part in the required mode.  */
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }

      xops[i] = op;
    }

  /* Generate the insn pattern with the arity determined above.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  /* Rounding was NO_ROUND (and the expander is safe to rewrite): strip
     the embedded rounding from the pattern.  */
  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
   13997              : 
   13998              : /* Subroutine of ix86_expand_builtin to take care of special insns
   13999              :    with variable number of operands.  */
   14000              : 
   14001              : static rtx
   14002        27190 : ix86_expand_special_args_builtin (const struct builtin_description *d,
   14003              :                                   tree exp, rtx target)
   14004              : {
   14005        27190 :   tree arg;
   14006        27190 :   rtx pat, op;
   14007        27190 :   unsigned int i, nargs, arg_adjust, memory;
   14008        27190 :   unsigned int constant = 100;
   14009        27190 :   bool aligned_mem = false;
   14010        27190 :   rtx xops[4];
   14011        27190 :   enum insn_code icode = d->icode;
   14012        27190 :   const struct insn_data_d *insn_p = &insn_data[icode];
   14013        27190 :   machine_mode tmode = insn_p->operand[0].mode;
   14014        27190 :   enum { load, store } klass;
   14015              : 
   14016        27190 :   switch ((enum ix86_builtin_func_type) d->flag)
   14017              :     {
   14018        15380 :     case VOID_FTYPE_VOID:
   14019        15380 :       emit_insn (GEN_FCN (icode) (target));
   14020        15380 :       return 0;
   14021              :     case VOID_FTYPE_UINT64:
   14022              :     case VOID_FTYPE_UNSIGNED:
   14023              :       nargs = 0;
   14024              :       klass = store;
   14025              :       memory = 0;
   14026              :       break;
   14027              : 
   14028         7581 :     case INT_FTYPE_VOID:
   14029         7581 :     case USHORT_FTYPE_VOID:
   14030         7581 :     case UINT64_FTYPE_VOID:
   14031         7581 :     case UINT_FTYPE_VOID:
   14032         7581 :     case UINT8_FTYPE_VOID:
   14033         7581 :     case UNSIGNED_FTYPE_VOID:
   14034         7581 :       nargs = 0;
   14035         7581 :       klass = load;
   14036         7581 :       memory = 0;
   14037         7581 :       break;
   14038          359 :     case CHAR_FTYPE_PCCHAR:
   14039          359 :     case SHORT_FTYPE_PCSHORT:
   14040          359 :     case INT_FTYPE_PCINT:
   14041          359 :     case INT64_FTYPE_PCINT64:
   14042          359 :     case UINT64_FTYPE_PUNSIGNED:
   14043          359 :     case V2DI_FTYPE_PV2DI:
   14044          359 :     case V4DI_FTYPE_PV4DI:
   14045          359 :     case V32QI_FTYPE_PCCHAR:
   14046          359 :     case V16QI_FTYPE_PCCHAR:
   14047          359 :     case V8SF_FTYPE_PCV4SF:
   14048          359 :     case V8SF_FTYPE_PCFLOAT:
   14049          359 :     case V4SF_FTYPE_PCFLOAT:
   14050          359 :     case V4SF_FTYPE_PCFLOAT16:
   14051          359 :     case V4SF_FTYPE_PCBFLOAT16:
   14052          359 :     case V4SF_FTYPE_PCV8BF:
   14053          359 :     case V4SF_FTYPE_PCV8HF:
   14054          359 :     case V8SF_FTYPE_PCFLOAT16:
   14055          359 :     case V8SF_FTYPE_PCBFLOAT16:
   14056          359 :     case V8SF_FTYPE_PCV16HF:
   14057          359 :     case V8SF_FTYPE_PCV16BF:
   14058          359 :     case V4DF_FTYPE_PCV2DF:
   14059          359 :     case V4DF_FTYPE_PCDOUBLE:
   14060          359 :     case V2DF_FTYPE_PCDOUBLE:
   14061          359 :     case VOID_FTYPE_PVOID:
   14062          359 :     case V8DI_FTYPE_PV8DI:
   14063          359 :       nargs = 1;
   14064          359 :       klass = load;
   14065          359 :       memory = 0;
   14066          359 :       switch (icode)
   14067              :         {
   14068              :         case CODE_FOR_sse4_1_movntdqa:
   14069              :         case CODE_FOR_avx2_movntdqa:
   14070              :         case CODE_FOR_avx512f_movntdqa:
   14071              :           aligned_mem = true;
   14072              :           break;
   14073              :         default:
   14074              :           break;
   14075              :         }
   14076              :       break;
   14077          371 :     case VOID_FTYPE_PV2SF_V4SF:
   14078          371 :     case VOID_FTYPE_PV8DI_V8DI:
   14079          371 :     case VOID_FTYPE_PV4DI_V4DI:
   14080          371 :     case VOID_FTYPE_PV2DI_V2DI:
   14081          371 :     case VOID_FTYPE_PCHAR_V32QI:
   14082          371 :     case VOID_FTYPE_PCHAR_V16QI:
   14083          371 :     case VOID_FTYPE_PFLOAT_V16SF:
   14084          371 :     case VOID_FTYPE_PFLOAT_V8SF:
   14085          371 :     case VOID_FTYPE_PFLOAT_V4SF:
   14086          371 :     case VOID_FTYPE_PDOUBLE_V8DF:
   14087          371 :     case VOID_FTYPE_PDOUBLE_V4DF:
   14088          371 :     case VOID_FTYPE_PDOUBLE_V2DF:
   14089          371 :     case VOID_FTYPE_PLONGLONG_LONGLONG:
   14090          371 :     case VOID_FTYPE_PULONGLONG_ULONGLONG:
   14091          371 :     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
   14092          371 :     case VOID_FTYPE_PINT_INT:
   14093          371 :       nargs = 1;
   14094          371 :       klass = store;
   14095              :       /* Reserve memory operand for target.  */
   14096          371 :       memory = ARRAY_SIZE (xops);
   14097          371 :       switch (icode)
   14098              :         {
   14099              :         /* These builtins and instructions require the memory
   14100              :            to be properly aligned.  */
   14101              :         case CODE_FOR_avx_movntv4di:
   14102              :         case CODE_FOR_sse2_movntv2di:
   14103              :         case CODE_FOR_avx_movntv8sf:
   14104              :         case CODE_FOR_sse_movntv4sf:
   14105              :         case CODE_FOR_sse4a_vmmovntv4sf:
   14106              :         case CODE_FOR_avx_movntv4df:
   14107              :         case CODE_FOR_sse2_movntv2df:
   14108              :         case CODE_FOR_sse4a_vmmovntv2df:
   14109              :         case CODE_FOR_sse2_movntidi:
   14110              :         case CODE_FOR_sse_movntq:
   14111              :         case CODE_FOR_sse2_movntisi:
   14112              :         case CODE_FOR_avx512f_movntv16sf:
   14113              :         case CODE_FOR_avx512f_movntv8df:
   14114              :         case CODE_FOR_avx512f_movntv8di:
   14115              :           aligned_mem = true;
   14116              :           break;
   14117              :         default:
   14118              :           break;
   14119              :         }
   14120              :       break;
   14121            0 :     case VOID_FTYPE_PVOID_PCVOID:
   14122            0 :         nargs = 1;
   14123            0 :         klass = store;
   14124            0 :         memory = 0;
   14125              : 
   14126            0 :         break;
   14127           26 :     case V4SF_FTYPE_V4SF_PCV2SF:
   14128           26 :     case V2DF_FTYPE_V2DF_PCDOUBLE:
   14129           26 :       nargs = 2;
   14130           26 :       klass = load;
   14131           26 :       memory = 1;
   14132           26 :       break;
   14133           93 :     case V8SF_FTYPE_PCV8SF_V8SI:
   14134           93 :     case V4DF_FTYPE_PCV4DF_V4DI:
   14135           93 :     case V4SF_FTYPE_PCV4SF_V4SI:
   14136           93 :     case V2DF_FTYPE_PCV2DF_V2DI:
   14137           93 :     case V8SI_FTYPE_PCV8SI_V8SI:
   14138           93 :     case V4DI_FTYPE_PCV4DI_V4DI:
   14139           93 :     case V4SI_FTYPE_PCV4SI_V4SI:
   14140           93 :     case V2DI_FTYPE_PCV2DI_V2DI:
   14141           93 :     case VOID_FTYPE_INT_INT64:
   14142           93 :       nargs = 2;
   14143           93 :       klass = load;
   14144           93 :       memory = 0;
   14145           93 :       break;
   14146          360 :     case VOID_FTYPE_PV8DF_V8DF_UQI:
   14147          360 :     case VOID_FTYPE_PV4DF_V4DF_UQI:
   14148          360 :     case VOID_FTYPE_PV2DF_V2DF_UQI:
   14149          360 :     case VOID_FTYPE_PV16SF_V16SF_UHI:
   14150          360 :     case VOID_FTYPE_PV8SF_V8SF_UQI:
   14151          360 :     case VOID_FTYPE_PV4SF_V4SF_UQI:
   14152          360 :     case VOID_FTYPE_PV8DI_V8DI_UQI:
   14153          360 :     case VOID_FTYPE_PV4DI_V4DI_UQI:
   14154          360 :     case VOID_FTYPE_PV2DI_V2DI_UQI:
   14155          360 :     case VOID_FTYPE_PV16SI_V16SI_UHI:
   14156          360 :     case VOID_FTYPE_PV8SI_V8SI_UQI:
   14157          360 :     case VOID_FTYPE_PV4SI_V4SI_UQI:
   14158          360 :     case VOID_FTYPE_PV64QI_V64QI_UDI:
   14159          360 :     case VOID_FTYPE_PV32HI_V32HI_USI:
   14160          360 :     case VOID_FTYPE_PV32QI_V32QI_USI:
   14161          360 :     case VOID_FTYPE_PV16QI_V16QI_UHI:
   14162          360 :     case VOID_FTYPE_PV16HI_V16HI_UHI:
   14163          360 :     case VOID_FTYPE_PV8HI_V8HI_UQI:
   14164          360 :       switch (icode)
   14165              :         {
   14166              :         /* These builtins and instructions require the memory
   14167              :            to be properly aligned.  */
   14168              :         case CODE_FOR_avx512f_storev16sf_mask:
   14169              :         case CODE_FOR_avx512f_storev16si_mask:
   14170              :         case CODE_FOR_avx512f_storev8df_mask:
   14171              :         case CODE_FOR_avx512f_storev8di_mask:
   14172              :         case CODE_FOR_avx512vl_storev8sf_mask:
   14173              :         case CODE_FOR_avx512vl_storev8si_mask:
   14174              :         case CODE_FOR_avx512vl_storev4df_mask:
   14175              :         case CODE_FOR_avx512vl_storev4di_mask:
   14176              :         case CODE_FOR_avx512vl_storev4sf_mask:
   14177              :         case CODE_FOR_avx512vl_storev4si_mask:
   14178              :         case CODE_FOR_avx512vl_storev2df_mask:
   14179              :         case CODE_FOR_avx512vl_storev2di_mask:
   14180        11810 :           aligned_mem = true;
   14181              :           break;
   14182              :         default:
   14183              :           break;
   14184              :         }
   14185              :       /* FALLTHRU */
   14186              :     case VOID_FTYPE_PV8SF_V8SI_V8SF:
   14187              :     case VOID_FTYPE_PV4DF_V4DI_V4DF:
   14188              :     case VOID_FTYPE_PV4SF_V4SI_V4SF:
   14189              :     case VOID_FTYPE_PV2DF_V2DI_V2DF:
   14190              :     case VOID_FTYPE_PV8SI_V8SI_V8SI:
   14191              :     case VOID_FTYPE_PV4DI_V4DI_V4DI:
   14192              :     case VOID_FTYPE_PV4SI_V4SI_V4SI:
   14193              :     case VOID_FTYPE_PV2DI_V2DI_V2DI:
   14194              :     case VOID_FTYPE_PV8SI_V8DI_UQI:
   14195              :     case VOID_FTYPE_PV8HI_V8DI_UQI:
   14196              :     case VOID_FTYPE_PV16HI_V16SI_UHI:
   14197              :     case VOID_FTYPE_PUDI_V8DI_UQI:
   14198              :     case VOID_FTYPE_PV16QI_V16SI_UHI:
   14199              :     case VOID_FTYPE_PV4SI_V4DI_UQI:
   14200              :     case VOID_FTYPE_PUDI_V2DI_UQI:
   14201              :     case VOID_FTYPE_PUDI_V4DI_UQI:
   14202              :     case VOID_FTYPE_PUSI_V2DI_UQI:
   14203              :     case VOID_FTYPE_PV8HI_V8SI_UQI:
   14204              :     case VOID_FTYPE_PUDI_V4SI_UQI:
   14205              :     case VOID_FTYPE_PUSI_V4DI_UQI:
   14206              :     case VOID_FTYPE_PUHI_V2DI_UQI:
   14207              :     case VOID_FTYPE_PUDI_V8SI_UQI:
   14208              :     case VOID_FTYPE_PUSI_V4SI_UQI:
   14209              :     case VOID_FTYPE_PCHAR_V64QI_UDI:
   14210              :     case VOID_FTYPE_PCHAR_V32QI_USI:
   14211              :     case VOID_FTYPE_PCHAR_V16QI_UHI:
   14212              :     case VOID_FTYPE_PSHORT_V32HI_USI:
   14213              :     case VOID_FTYPE_PSHORT_V16HI_UHI:
   14214              :     case VOID_FTYPE_PSHORT_V8HI_UQI:
   14215              :     case VOID_FTYPE_PINT_V16SI_UHI:
   14216              :     case VOID_FTYPE_PINT_V8SI_UQI:
   14217              :     case VOID_FTYPE_PINT_V4SI_UQI:
   14218              :     case VOID_FTYPE_PINT64_V8DI_UQI:
   14219              :     case VOID_FTYPE_PINT64_V4DI_UQI:
   14220              :     case VOID_FTYPE_PINT64_V2DI_UQI:
   14221              :     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
   14222              :     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
   14223              :     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
   14224              :     case VOID_FTYPE_PFLOAT_V16SF_UHI:
   14225              :     case VOID_FTYPE_PFLOAT_V8SF_UQI:
   14226              :     case VOID_FTYPE_PFLOAT_V4SF_UQI:
   14227              :     case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
   14228              :     case VOID_FTYPE_PV32QI_V32HI_USI:
   14229              :     case VOID_FTYPE_PV16QI_V16HI_UHI:
   14230              :     case VOID_FTYPE_PUDI_V8HI_UQI:
   14231              :       nargs = 2;
   14232              :       klass = store;
   14233              :       /* Reserve memory operand for target.  */
   14234              :       memory = ARRAY_SIZE (xops);
   14235              :       break;
   14236         1243 :     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
   14237         1243 :     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
   14238         1243 :     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
   14239         1243 :     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
   14240         1243 :     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
   14241         1243 :     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
   14242         1243 :     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
   14243         1243 :     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
   14244         1243 :     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
   14245         1243 :     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
   14246         1243 :     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
   14247         1243 :     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
   14248         1243 :     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
   14249         1243 :     case V32HI_FTYPE_PCV32HI_V32HI_USI:
   14250         1243 :     case V32QI_FTYPE_PCV32QI_V32QI_USI:
   14251         1243 :     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
   14252         1243 :     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
   14253         1243 :     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
   14254         1243 :       switch (icode)
   14255              :         {
   14256              :         /* These builtins and instructions require the memory
   14257              :            to be properly aligned.  */
   14258              :         case CODE_FOR_avx512f_loadv16sf_mask:
   14259              :         case CODE_FOR_avx512f_loadv16si_mask:
   14260              :         case CODE_FOR_avx512f_loadv8df_mask:
   14261              :         case CODE_FOR_avx512f_loadv8di_mask:
   14262              :         case CODE_FOR_avx512vl_loadv8sf_mask:
   14263              :         case CODE_FOR_avx512vl_loadv8si_mask:
   14264              :         case CODE_FOR_avx512vl_loadv4df_mask:
   14265              :         case CODE_FOR_avx512vl_loadv4di_mask:
   14266              :         case CODE_FOR_avx512vl_loadv4sf_mask:
   14267              :         case CODE_FOR_avx512vl_loadv4si_mask:
   14268              :         case CODE_FOR_avx512vl_loadv2df_mask:
   14269              :         case CODE_FOR_avx512vl_loadv2di_mask:
   14270              :         case CODE_FOR_avx512bw_loadv64qi_mask:
   14271              :         case CODE_FOR_avx512vl_loadv32qi_mask:
   14272              :         case CODE_FOR_avx512vl_loadv16qi_mask:
   14273              :         case CODE_FOR_avx512bw_loadv32hi_mask:
   14274              :         case CODE_FOR_avx512vl_loadv16hi_mask:
   14275              :         case CODE_FOR_avx512vl_loadv8hi_mask:
   14276        11810 :           aligned_mem = true;
   14277              :           break;
   14278              :         default:
   14279              :           break;
   14280              :         }
   14281              :       /* FALLTHRU */
   14282              :     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
   14283              :     case V32QI_FTYPE_PCCHAR_V32QI_USI:
   14284              :     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
   14285              :     case V32HI_FTYPE_PCSHORT_V32HI_USI:
   14286              :     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
   14287              :     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
   14288              :     case V16SI_FTYPE_PCINT_V16SI_UHI:
   14289              :     case V8SI_FTYPE_PCINT_V8SI_UQI:
   14290              :     case V4SI_FTYPE_PCINT_V4SI_UQI:
   14291              :     case V8DI_FTYPE_PCINT64_V8DI_UQI:
   14292              :     case V4DI_FTYPE_PCINT64_V4DI_UQI:
   14293              :     case V2DI_FTYPE_PCINT64_V2DI_UQI:
   14294              :     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
   14295              :     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
   14296              :     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
   14297              :     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
   14298              :     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
   14299              :     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
   14300              :     case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
   14301              :       nargs = 3;
   14302              :       klass = load;
   14303              :       memory = 0;
   14304              :       break;
   14305          105 :     case INT_FTYPE_PINT_INT_INT_INT:
   14306          105 :     case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
   14307          105 :       nargs = 4;
   14308          105 :       klass = load;
   14309          105 :       memory = 0;
   14310          105 :       constant = 3;
   14311          105 :       break;
   14312            0 :     default:
   14313            0 :       gcc_unreachable ();
   14314              :     }
   14315              : 
   14316         8339 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   14317              : 
   14318        11810 :   if (klass == store)
   14319              :     {
   14320         1875 :       arg = CALL_EXPR_ARG (exp, 0);
   14321         1875 :       op = expand_normal (arg);
   14322         1875 :       gcc_assert (target == 0);
   14323         1875 :       if (memory)
   14324              :         {
   14325         1715 :           op = ix86_zero_extend_to_Pmode (op);
   14326         1715 :           target = gen_rtx_MEM (tmode, op);
   14327              :           /* target at this point has just BITS_PER_UNIT MEM_ALIGN
   14328              :              on it.  Try to improve it using get_pointer_alignment,
   14329              :              and if the special builtin is one that requires strict
   14330              :              mode alignment, also from it's GET_MODE_ALIGNMENT.
   14331              :              Failure to do so could lead to ix86_legitimate_combined_insn
   14332              :              rejecting all changes to such insns.  */
   14333         1715 :           unsigned int align = get_pointer_alignment (arg);
   14334         1715 :           if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
   14335          275 :             align = GET_MODE_ALIGNMENT (tmode);
   14336         3430 :           if (MEM_ALIGN (target) < align)
   14337          422 :             set_mem_align (target, align);
   14338              :         }
   14339              :       else
   14340          160 :         target = force_reg (tmode, op);
   14341              :       arg_adjust = 1;
   14342              :     }
   14343              :   else
   14344              :     {
   14345         9935 :       arg_adjust = 0;
   14346         9935 :       if (optimize
   14347         2918 :           || target == 0
   14348         2918 :           || !register_operand (target, tmode)
   14349        12842 :           || GET_MODE (target) != tmode)
   14350         7028 :         target = gen_reg_rtx (tmode);
   14351              :     }
   14352              : 
   14353        21199 :   for (i = 0; i < nargs; i++)
   14354              :     {
   14355         9389 :       machine_mode mode = insn_p->operand[i + 1].mode;
   14356              : 
   14357         9389 :       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
   14358         9389 :       op = ix86_expand_unsigned_small_int_cst_argument (arg);
   14359              : 
   14360         9389 :       if (i == memory)
   14361              :         {
   14362              :           /* This must be the memory operand.  */
   14363         2354 :           op = ix86_zero_extend_to_Pmode (op);
   14364         2354 :           op = gen_rtx_MEM (mode, op);
   14365              :           /* op at this point has just BITS_PER_UNIT MEM_ALIGN
   14366              :              on it.  Try to improve it using get_pointer_alignment,
   14367              :              and if the special builtin is one that requires strict
   14368              :              mode alignment, also from it's GET_MODE_ALIGNMENT.
   14369              :              Failure to do so could lead to ix86_legitimate_combined_insn
   14370              :              rejecting all changes to such insns.  */
   14371         2354 :           unsigned int align = get_pointer_alignment (arg);
   14372         2354 :           if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
   14373          299 :             align = GET_MODE_ALIGNMENT (mode);
   14374         4708 :           if (MEM_ALIGN (op) < align)
   14375          523 :             set_mem_align (op, align);
   14376              :         }
   14377         7035 :       else if (i == constant)
   14378              :         {
   14379              :           /* This must be the constant.  */
   14380          105 :           if (!insn_p->operand[nargs].predicate(op, SImode))
   14381              :             {
   14382            0 :               error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
   14383            0 :               return const0_rtx;
   14384              :             }
   14385              :         }
   14386              :       else
   14387              :         {
   14388              :           /* This must be register.  */
   14389         6930 :           if (VECTOR_MODE_P (mode))
   14390         3475 :             op = safe_vector_operand (op, mode);
   14391              : 
   14392         6930 :           op = fixup_modeless_constant (op, mode);
   14393              : 
   14394              :           /* NB: 3-operands load implied it's a mask load or v{p}expand*,
   14395              :              and that mask operand shoud be at the end.
   14396              :              Keep all-ones mask which would be simplified by the expander.  */
   14397         1771 :           if (nargs == 3 && i == 2 && klass == load
   14398         1771 :               && constm1_operand (op, mode)
   14399         7103 :               && insn_p->operand[i].predicate (op, mode))
   14400              :             ;
   14401         6930 :           else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   14402         6930 :             op = copy_to_mode_reg (mode, op);
   14403              :           else
   14404              :             {
   14405            0 :               op = copy_to_reg (op);
   14406            0 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   14407              :             }
   14408              :         }
   14409              : 
   14410         9389 :       xops[i]= op;
   14411              :     }
   14412              : 
   14413        11810 :   switch (nargs)
   14414              :     {
   14415         7741 :     case 0:
   14416         7741 :       pat = GEN_FCN (icode) (target);
   14417         7741 :       break;
   14418          730 :     case 1:
   14419          730 :       pat = GEN_FCN (icode) (target, xops[0]);
   14420          730 :       break;
   14421         1463 :     case 2:
   14422         1463 :       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
   14423         1463 :       break;
   14424         1771 :     case 3:
   14425         1771 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
   14426         1771 :       break;
   14427          105 :     case 4:
   14428          105 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
   14429          105 :       break;
   14430              :     default:
   14431              :       gcc_unreachable ();
   14432              :     }
   14433              : 
   14434        11810 :   if (! pat)
   14435              :     return 0;
   14436              : 
   14437        11810 :   emit_insn (pat);
   14438        11810 :   return klass == store ? 0 : target;
   14439              : }
   14440              : 
   14441              : /* Return the integer constant in ARG.  Constrain it to be in the range
   14442              :    of the subparts of VEC_TYPE; issue an error if not.  */
   14443              : 
   14444              : static int
   14445          604 : get_element_number (tree vec_type, tree arg)
   14446              : {
   14447          604 :   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
   14448              : 
   14449          604 :   if (!tree_fits_uhwi_p (arg)
   14450          604 :       || (elt = tree_to_uhwi (arg), elt > max))
   14451              :     {
   14452            0 :       error ("selector must be an integer constant in the range "
   14453              :              "[0, %wi]", max);
   14454            0 :       return 0;
   14455              :     }
   14456              : 
   14457          604 :   return elt;
   14458              : }
   14459              : 
   14460              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14461              :    ix86_expand_vector_init.  We DO have language-level syntax for this, in
   14462              :    the form of  (type){ init-list }.  Except that since we can't place emms
   14463              :    instructions from inside the compiler, we can't allow the use of MMX
   14464              :    registers unless the user explicitly asks for it.  So we do *not* define
   14465              :    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   14466              :    we have builtins invoked by mmintrin.h that gives us license to emit
   14467              :    these sorts of instructions.  */
   14468              : 
   14469              : static rtx
   14470          229 : ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
   14471              : {
   14472          229 :   machine_mode tmode = TYPE_MODE (type);
   14473          229 :   machine_mode inner_mode = GET_MODE_INNER (tmode);
   14474          229 :   int i, n_elt = GET_MODE_NUNITS (tmode);
   14475          229 :   rtvec v = rtvec_alloc (n_elt);
   14476              : 
   14477          229 :   gcc_assert (VECTOR_MODE_P (tmode));
   14478          229 :   gcc_assert (call_expr_nargs (exp) == n_elt);
   14479              : 
   14480         1203 :   for (i = 0; i < n_elt; ++i)
   14481              :     {
   14482          974 :       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
   14483          974 :       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
   14484              :     }
   14485              : 
   14486          229 :   if (!target || !register_operand (target, tmode))
   14487            0 :     target = gen_reg_rtx (tmode);
   14488              : 
   14489          229 :   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
   14490          229 :   return target;
   14491              : }
   14492              : 
   14493              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14494              :    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   14495              :    had a language-level syntax for referencing vector elements.  */
   14496              : 
   14497              : static rtx
   14498          400 : ix86_expand_vec_ext_builtin (tree exp, rtx target)
   14499              : {
   14500          400 :   machine_mode tmode, mode0;
   14501          400 :   tree arg0, arg1;
   14502          400 :   int elt;
   14503          400 :   rtx op0;
   14504              : 
   14505          400 :   arg0 = CALL_EXPR_ARG (exp, 0);
   14506          400 :   arg1 = CALL_EXPR_ARG (exp, 1);
   14507              : 
   14508          400 :   op0 = expand_normal (arg0);
   14509          400 :   elt = get_element_number (TREE_TYPE (arg0), arg1);
   14510              : 
   14511          400 :   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
   14512          400 :   mode0 = TYPE_MODE (TREE_TYPE (arg0));
   14513          400 :   gcc_assert (VECTOR_MODE_P (mode0));
   14514              : 
   14515          400 :   op0 = force_reg (mode0, op0);
   14516              : 
   14517          400 :   if (optimize || !target || !register_operand (target, tmode))
   14518          321 :     target = gen_reg_rtx (tmode);
   14519              : 
   14520          400 :   ix86_expand_vector_extract (true, target, op0, elt);
   14521              : 
   14522          400 :   return target;
   14523              : }
   14524              : 
   14525              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14526              :    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   14527              :    a language-level syntax for referencing vector elements.  */
   14528              : 
   14529              : static rtx
   14530          204 : ix86_expand_vec_set_builtin (tree exp)
   14531              : {
   14532          204 :   machine_mode tmode, mode1;
   14533          204 :   tree arg0, arg1, arg2;
   14534          204 :   int elt;
   14535          204 :   rtx op0, op1, target;
   14536              : 
   14537          204 :   arg0 = CALL_EXPR_ARG (exp, 0);
   14538          204 :   arg1 = CALL_EXPR_ARG (exp, 1);
   14539          204 :   arg2 = CALL_EXPR_ARG (exp, 2);
   14540              : 
   14541          204 :   tmode = TYPE_MODE (TREE_TYPE (arg0));
   14542          204 :   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
   14543          204 :   gcc_assert (VECTOR_MODE_P (tmode));
   14544              : 
   14545          204 :   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
   14546          204 :   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
   14547          204 :   elt = get_element_number (TREE_TYPE (arg0), arg2);
   14548              : 
   14549          204 :   if (GET_MODE (op1) != mode1)
   14550           82 :     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
   14551              : 
   14552          204 :   op0 = force_reg (tmode, op0);
   14553          204 :   op1 = force_reg (mode1, op1);
   14554              : 
   14555              :   /* OP0 is the source of these builtin functions and shouldn't be
   14556              :      modified.  Create a copy, use it and return it as target.  */
   14557          204 :   target = gen_reg_rtx (tmode);
   14558          204 :   emit_move_insn (target, op0);
   14559          204 :   ix86_expand_vector_set (true, target, op1, elt);
   14560              : 
   14561          204 :   return target;
   14562              : }
   14563              : 
/* Return true if the necessary isa options for this builtin exist,
   else false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);
   If PBISA/PBISA2 are non-NULL, the (possibly MMX-adjusted) required
   ISA masks are stored through them regardless of the result.  */
bool
ix86_check_builtin_isa_match (unsigned int fcode,
                              HOST_WIDE_INT* pbisa,
                              HOST_WIDE_INT* pbisa2)
{
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXIFMA
     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
       OPTION_MASK_ISA2_AVXNECONVERT
     OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
     OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT8
     OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT16
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */

/* If the builtin requires both alternatives (A1,A2) and (B1,B2) of a
   shared pair, and at least one alternative is enabled, pretend both
   are enabled in the effective ISA so the final subset test passes.  */
#define SHARE_BUILTIN(A1, A2, B1, B2) \
  if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
       && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
      && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
          || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
    { \
      tmp_isa |= (A1) | (B1); \
      tmp_isa2 |= (A2) | (B2); \
    }

  SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
                 OPTION_MASK_ISA2_AVXVNNI);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
                 OPTION_MASK_ISA2_AVXIFMA);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
                 OPTION_MASK_ISA2_AVXNECONVERT);
  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
                 OPTION_MASK_ISA2_VAES);
  SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT8, 0,
                 OPTION_MASK_ISA2_AVX10_2);
  SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT16, 0,
                 OPTION_MASK_ISA2_AVX10_2);
  isa = tmp_isa;
  isa2 = tmp_isa2;

  /* When MMX itself is disabled but the MMX builtins can be implemented
     with SSE (TARGET_MMX_WITH_SSE), require SSE2 instead of MMX.  */
  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  /* Report the adjusted requirement masks back to the caller.  */
  if (pbisa)
    *pbisa = bisa;
  if (pbisa2)
    *pbisa2 = bisa2;

  /* All required bits must be present in the effective enabled ISAs.  */
  return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
   14639              : 
   14640              : /* Emit instructions to set the carry flag from ARG.  */
   14641              : 
   14642              : void
   14643        13267 : ix86_expand_carry (rtx arg)
   14644              : {
   14645        13267 :   if (!CONST_INT_P (arg) || arg == const0_rtx)
   14646              :     {
   14647        13261 :       arg = convert_to_mode (QImode, arg, 1);
   14648        13261 :       arg = copy_to_mode_reg (QImode, arg);
   14649        13261 :       emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
   14650              :     }
   14651              :   else
   14652            6 :     emit_insn (gen_x86_stc ());
   14653        13267 : }
   14654              : 
   14655              : /* Expand an expression EXP that calls a built-in function,
   14656              :    with result going to TARGET if that's convenient
   14657              :    (and in mode MODE if that's convenient).
   14658              :    SUBTARGET may be used as the target for computing one of EXP's operands.
   14659              :    IGNORE is nonzero if the value is to be ignored.  */
   14660              : 
   14661              : rtx
   14662       174055 : ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
   14663              :                      machine_mode mode, int ignore)
   14664              : {
   14665       174055 :   size_t i;
   14666       174055 :   enum insn_code icode, icode2;
   14667       174055 :   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   14668       174055 :   tree arg0, arg1, arg2, arg3, arg4;
   14669       174055 :   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
   14670       174055 :   machine_mode mode0, mode1, mode2, mode3, mode4;
   14671       174055 :   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
   14672       174055 :   HOST_WIDE_INT bisa, bisa2;
   14673              : 
   14674              :   /* For CPU builtins that can be folded, fold first and expand the fold.  */
   14675       174055 :   switch (fcode)
   14676              :     {
   14677          197 :     case IX86_BUILTIN_CPU_INIT:
   14678          197 :       {
   14679              :         /* Make it call __cpu_indicator_init in libgcc.  */
   14680          197 :         tree call_expr, fndecl, type;
   14681          197 :         type = build_function_type_list (integer_type_node, NULL_TREE);
   14682          197 :         fndecl = build_fn_decl ("__cpu_indicator_init", type);
   14683          197 :         call_expr = build_call_expr (fndecl, 0);
   14684          197 :         return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
   14685              :       }
   14686          586 :     case IX86_BUILTIN_CPU_IS:
   14687          586 :     case IX86_BUILTIN_CPU_SUPPORTS:
   14688          586 :       {
   14689          586 :         tree arg0 = CALL_EXPR_ARG (exp, 0);
   14690          586 :         tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
   14691          586 :         gcc_assert (fold_expr != NULL_TREE);
   14692          586 :         return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
   14693              :       }
   14694              :     }
   14695              : 
   14696       173272 :   if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
   14697              :     {
   14698           23 :       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
   14699           23 :       if (TARGET_ABI_X32)
   14700            0 :         bisa |= OPTION_MASK_ABI_X32;
   14701              :       else
   14702           23 :         bisa |= OPTION_MASK_ABI_64;
   14703           23 :       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
   14704              :                                        (enum fpmath_unit) 0,
   14705              :                                        (enum prefer_vector_width) 0,
   14706              :                                        PVW_NONE, false, add_abi_p);
   14707           23 :       if (!opts)
   14708            0 :         error ("%qE needs unknown isa option", fndecl);
   14709              :       else
   14710              :         {
   14711           23 :           gcc_assert (opts != NULL);
   14712           23 :           error ("%qE needs isa option %s", fndecl, opts);
   14713           23 :           free (opts);
   14714              :         }
   14715           23 :       return expand_call (exp, target, ignore);
   14716              :     }
   14717              : 
   14718       173249 :   switch (fcode)
   14719              :     {
   14720           35 :     case IX86_BUILTIN_MASKMOVQ:
   14721           35 :     case IX86_BUILTIN_MASKMOVDQU:
   14722           34 :       icode = (fcode == IX86_BUILTIN_MASKMOVQ
   14723           35 :                ? CODE_FOR_mmx_maskmovq
   14724              :                : CODE_FOR_sse2_maskmovdqu);
   14725              :       /* Note the arg order is different from the operand order.  */
   14726           35 :       arg1 = CALL_EXPR_ARG (exp, 0);
   14727           35 :       arg2 = CALL_EXPR_ARG (exp, 1);
   14728           35 :       arg0 = CALL_EXPR_ARG (exp, 2);
   14729           35 :       op0 = expand_normal (arg0);
   14730           35 :       op1 = expand_normal (arg1);
   14731           35 :       op2 = expand_normal (arg2);
   14732           35 :       mode0 = insn_data[icode].operand[0].mode;
   14733           35 :       mode1 = insn_data[icode].operand[1].mode;
   14734           35 :       mode2 = insn_data[icode].operand[2].mode;
   14735              : 
   14736           35 :       op0 = ix86_zero_extend_to_Pmode (op0);
   14737           35 :       op0 = gen_rtx_MEM (mode1, op0);
   14738              : 
   14739           35 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   14740            0 :         op0 = copy_to_mode_reg (mode0, op0);
   14741           35 :       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   14742            2 :         op1 = copy_to_mode_reg (mode1, op1);
   14743           35 :       if (!insn_data[icode].operand[2].predicate (op2, mode2))
   14744            2 :         op2 = copy_to_mode_reg (mode2, op2);
   14745           35 :       pat = GEN_FCN (icode) (op0, op1, op2);
   14746           35 :       if (! pat)
   14747        56624 :         return 0;
   14748           35 :       emit_insn (pat);
   14749           35 :       return 0;
   14750              : 
   14751        22008 :     case IX86_BUILTIN_LDMXCSR:
   14752        22008 :       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
   14753        22008 :       target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
   14754        22008 :       emit_move_insn (target, op0);
   14755        22008 :       emit_insn (gen_sse_ldmxcsr (target));
   14756        22008 :       return 0;
   14757              : 
   14758        14785 :     case IX86_BUILTIN_STMXCSR:
   14759        14785 :       target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
   14760        14785 :       emit_insn (gen_sse_stmxcsr (target));
   14761        14785 :       return copy_to_mode_reg (SImode, target);
   14762              : 
   14763           11 :     case IX86_BUILTIN_CLFLUSH:
   14764           11 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14765           11 :         op0 = expand_normal (arg0);
   14766           11 :         icode = CODE_FOR_sse2_clflush;
   14767           11 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14768            5 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14769              : 
   14770           11 :         emit_insn (gen_sse2_clflush (op0));
   14771           11 :         return 0;
   14772              : 
   14773           19 :     case IX86_BUILTIN_CLWB:
   14774           19 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14775           19 :         op0 = expand_normal (arg0);
   14776           19 :         icode = CODE_FOR_clwb;
   14777           19 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14778            9 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14779              : 
   14780           19 :         emit_insn (gen_clwb (op0));
   14781           19 :         return 0;
   14782              : 
   14783           19 :     case IX86_BUILTIN_CLFLUSHOPT:
   14784           19 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14785           19 :         op0 = expand_normal (arg0);
   14786           19 :         icode = CODE_FOR_clflushopt;
   14787           19 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14788            9 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14789              : 
   14790           19 :         emit_insn (gen_clflushopt (op0));
   14791           19 :         return 0;
   14792              : 
   14793           47 :     case IX86_BUILTIN_MONITOR:
   14794           47 :     case IX86_BUILTIN_MONITORX:
   14795           47 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14796           47 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14797           47 :       arg2 = CALL_EXPR_ARG (exp, 2);
   14798           47 :       op0 = expand_normal (arg0);
   14799           47 :       op1 = expand_normal (arg1);
   14800           47 :       op2 = expand_normal (arg2);
   14801           47 :       if (!REG_P (op0))
   14802           19 :         op0 = ix86_zero_extend_to_Pmode (op0);
   14803           47 :       if (!REG_P (op1))
   14804           22 :         op1 = copy_to_mode_reg (SImode, op1);
   14805           47 :       if (!REG_P (op2))
   14806           25 :         op2 = copy_to_mode_reg (SImode, op2);
   14807              : 
   14808           47 :       emit_insn (fcode == IX86_BUILTIN_MONITOR
   14809           26 :                  ? gen_sse3_monitor (Pmode, op0, op1, op2)
   14810           21 :                  : gen_monitorx (Pmode, op0, op1, op2));
   14811           47 :       return 0;
   14812              : 
   14813           25 :     case IX86_BUILTIN_MWAIT:
   14814           25 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14815           25 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14816           25 :       op0 = expand_normal (arg0);
   14817           25 :       op1 = expand_normal (arg1);
   14818           25 :       if (!REG_P (op0))
   14819           13 :         op0 = copy_to_mode_reg (SImode, op0);
   14820           25 :       if (!REG_P (op1))
   14821           11 :         op1 = copy_to_mode_reg (SImode, op1);
   14822           25 :       emit_insn (gen_sse3_mwait (op0, op1));
   14823           25 :       return 0;
   14824              : 
   14825           21 :     case IX86_BUILTIN_MWAITX:
   14826           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14827           21 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14828           21 :       arg2 = CALL_EXPR_ARG (exp, 2);
   14829           21 :       op0 = expand_normal (arg0);
   14830           21 :       op1 = expand_normal (arg1);
   14831           21 :       op2 = expand_normal (arg2);
   14832           21 :       if (!REG_P (op0))
   14833           11 :         op0 = copy_to_mode_reg (SImode, op0);
   14834           21 :       if (!REG_P (op1))
   14835           10 :         op1 = copy_to_mode_reg (SImode, op1);
   14836           21 :       if (!REG_P (op2))
   14837           11 :         op2 = copy_to_mode_reg (SImode, op2);
   14838           21 :       emit_insn (gen_mwaitx (op0, op1, op2));
   14839           21 :       return 0;
   14840              : 
   14841           21 :     case IX86_BUILTIN_UMONITOR:
   14842           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14843           21 :       op0 = expand_normal (arg0);
   14844              : 
   14845           21 :       op0 = ix86_zero_extend_to_Pmode (op0);
   14846           21 :       emit_insn (gen_umonitor (Pmode, op0));
   14847           21 :       return 0;
   14848              : 
   14849           42 :     case IX86_BUILTIN_UMWAIT:
   14850           42 :     case IX86_BUILTIN_TPAUSE:
   14851           42 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14852           42 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14853           42 :       op0 = expand_normal (arg0);
   14854           42 :       op1 = expand_normal (arg1);
   14855              : 
   14856           42 :       if (!REG_P (op0))
   14857           20 :         op0 = copy_to_mode_reg (SImode, op0);
   14858              : 
   14859           42 :       op1 = force_reg (DImode, op1);
   14860              : 
   14861           42 :       if (TARGET_64BIT)
   14862              :         {
   14863           42 :           op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   14864              :                                      NULL, 1, OPTAB_DIRECT);
   14865           42 :           switch (fcode)
   14866              :             {
   14867              :             case IX86_BUILTIN_UMWAIT:
   14868              :               icode = CODE_FOR_umwait_rex64;
   14869              :               break;
   14870           21 :             case IX86_BUILTIN_TPAUSE:
   14871           21 :               icode = CODE_FOR_tpause_rex64;
   14872           21 :               break;
   14873            0 :             default:
   14874            0 :               gcc_unreachable ();
   14875              :             }
   14876              : 
   14877           42 :           op2 = gen_lowpart (SImode, op2);
   14878           42 :           op1 = gen_lowpart (SImode, op1);
   14879           42 :           pat = GEN_FCN (icode) (op0, op1, op2);
   14880              :         }
   14881              :       else
   14882              :         {
   14883            0 :           switch (fcode)
   14884              :             {
   14885              :             case IX86_BUILTIN_UMWAIT:
   14886              :               icode = CODE_FOR_umwait;
   14887              :               break;
   14888            0 :             case IX86_BUILTIN_TPAUSE:
   14889            0 :               icode = CODE_FOR_tpause;
   14890            0 :               break;
   14891            0 :             default:
   14892            0 :               gcc_unreachable ();
   14893              :             }
   14894            0 :           pat = GEN_FCN (icode) (op0, op1);
   14895              :         }
   14896              : 
   14897           42 :       if (!pat)
   14898              :         return 0;
   14899              : 
   14900           42 :       emit_insn (pat);
   14901              : 
   14902           42 :       if (target == 0
   14903           42 :           || !register_operand (target, QImode))
   14904            0 :         target = gen_reg_rtx (QImode);
   14905              : 
   14906           42 :       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   14907              :                         const0_rtx);
   14908           42 :       emit_insn (gen_rtx_SET (target, pat));
   14909              : 
   14910           42 :       return target;
   14911              : 
   14912           20 :     case IX86_BUILTIN_TESTUI:
   14913           20 :       emit_insn (gen_testui ());
   14914              : 
   14915           20 :       if (target == 0
   14916           20 :           || !register_operand (target, QImode))
   14917            0 :         target = gen_reg_rtx (QImode);
   14918              : 
   14919           20 :       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   14920              :                          const0_rtx);
   14921           20 :       emit_insn (gen_rtx_SET (target, pat));
   14922              : 
   14923           20 :       return target;
   14924              : 
   14925           19 :     case IX86_BUILTIN_CLZERO:
   14926           19 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14927           19 :       op0 = expand_normal (arg0);
   14928           19 :       if (!REG_P (op0))
   14929            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   14930           19 :       emit_insn (gen_clzero (Pmode, op0));
   14931           19 :       return 0;
   14932              : 
   14933           19 :     case IX86_BUILTIN_CLDEMOTE:
   14934           19 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14935           19 :       op0 = expand_normal (arg0);
   14936           19 :       icode = CODE_FOR_cldemote;
   14937           19 :       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14938            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   14939              : 
   14940           19 :       emit_insn (gen_cldemote (op0));
   14941           19 :       return 0;
   14942              : 
   14943           11 :     case IX86_BUILTIN_LOADIWKEY:
   14944           11 :       {
   14945           11 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14946           11 :         arg1 = CALL_EXPR_ARG (exp, 1);
   14947           11 :         arg2 = CALL_EXPR_ARG (exp, 2);
   14948           11 :         arg3 = CALL_EXPR_ARG (exp, 3);
   14949              : 
   14950           11 :         op0 = expand_normal (arg0);
   14951           11 :         op1 = expand_normal (arg1);
   14952           11 :         op2 = expand_normal (arg2);
   14953           11 :         op3 = expand_normal (arg3);
   14954              : 
   14955           11 :         if (!REG_P (op0))
   14956            5 :           op0 = copy_to_mode_reg (V2DImode, op0);
   14957           11 :         if (!REG_P (op1))
   14958            5 :           op1 = copy_to_mode_reg (V2DImode, op1);
   14959           11 :         if (!REG_P (op2))
   14960            5 :           op2 = copy_to_mode_reg (V2DImode, op2);
   14961           11 :         if (!REG_P (op3))
   14962            5 :           op3 = copy_to_mode_reg (SImode, op3);
   14963              : 
   14964           11 :         emit_insn (gen_loadiwkey (op0, op1, op2, op3));
   14965              : 
   14966           11 :         return 0;
   14967              :       }
   14968              : 
   14969           12 :     case IX86_BUILTIN_AESDEC128KLU8:
   14970           12 :       icode = CODE_FOR_aesdec128klu8;
   14971           12 :       goto aesdecenc_expand;
   14972              : 
   14973           12 :     case IX86_BUILTIN_AESDEC256KLU8:
   14974           12 :       icode = CODE_FOR_aesdec256klu8;
   14975           12 :       goto aesdecenc_expand;
   14976              : 
   14977           12 :     case IX86_BUILTIN_AESENC128KLU8:
   14978           12 :       icode = CODE_FOR_aesenc128klu8;
   14979           12 :       goto aesdecenc_expand;
   14980              : 
   14981              :     case IX86_BUILTIN_AESENC256KLU8:
   14982              :       icode = CODE_FOR_aesenc256klu8;
   14983              : 
   14984           48 :     aesdecenc_expand:
   14985              : 
   14986           48 :       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
   14987           48 :       arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
   14988           48 :       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   14989              : 
   14990           48 :       op0 = expand_normal (arg0);
   14991           48 :       op1 = expand_normal (arg1);
   14992           48 :       op2 = expand_normal (arg2);
   14993              : 
   14994           48 :       if (!address_operand (op0, V2DImode))
   14995              :         {
   14996           16 :           op0 = convert_memory_address (Pmode, op0);
   14997           16 :           op0 = copy_addr_to_reg (op0);
   14998              :         }
   14999           48 :       op0 = gen_rtx_MEM (V2DImode, op0);
   15000              : 
   15001           48 :       if (!REG_P (op1))
   15002           20 :         op1 = copy_to_mode_reg (V2DImode, op1);
   15003              : 
   15004           48 :       if (!address_operand (op2, VOIDmode))
   15005              :         {
   15006           16 :           op2 = convert_memory_address (Pmode, op2);
   15007           16 :           op2 = copy_addr_to_reg (op2);
   15008              :         }
   15009           48 :       op2 = gen_rtx_MEM (BLKmode, op2);
   15010              : 
   15011           48 :       emit_insn (GEN_FCN (icode) (op1, op1, op2));
   15012              : 
   15013           48 :       if (target == 0)
   15014            4 :         target = gen_reg_rtx (QImode);
   15015              : 
   15016              :       /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
   15017              :          error occurs. Then the output should be cleared for safety. */
   15018           48 :       rtx_code_label *ok_label;
   15019           48 :       rtx tmp;
   15020              : 
   15021           48 :       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   15022           48 :       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   15023           48 :       ok_label = gen_label_rtx ();
   15024           48 :       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   15025              :                                true, ok_label);
   15026              :       /* Usually the runtime error seldom occur, so predict OK path as
   15027              :          hotspot to optimize it as fallthrough block. */
   15028           48 :       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   15029              : 
   15030           48 :       emit_insn (gen_rtx_SET (op1, const0_rtx));
   15031              : 
   15032           48 :       emit_label (ok_label);
   15033           48 :       emit_insn (gen_rtx_SET (target, pat));
   15034           48 :       emit_insn (gen_rtx_SET (op0, op1));
   15035              : 
   15036           48 :       return target;
   15037              : 
   15038           11 :     case IX86_BUILTIN_AESDECWIDE128KLU8:
   15039           11 :       icode = CODE_FOR_aesdecwide128klu8;
   15040           11 :       goto wideaesdecenc_expand;
   15041              : 
   15042           11 :     case IX86_BUILTIN_AESDECWIDE256KLU8:
   15043           11 :       icode = CODE_FOR_aesdecwide256klu8;
   15044           11 :       goto wideaesdecenc_expand;
   15045              : 
   15046           11 :     case IX86_BUILTIN_AESENCWIDE128KLU8:
   15047           11 :       icode = CODE_FOR_aesencwide128klu8;
   15048           11 :       goto wideaesdecenc_expand;
   15049              : 
   15050              :     case IX86_BUILTIN_AESENCWIDE256KLU8:
   15051              :       icode = CODE_FOR_aesencwide256klu8;
   15052              : 
   15053           44 :     wideaesdecenc_expand:
   15054              : 
   15055           44 :       rtx xmm_regs[8];
   15056           44 :       rtx op;
   15057              : 
   15058           44 :       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
   15059           44 :       arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
   15060           44 :       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   15061              : 
   15062           44 :       op0 = expand_normal (arg0);
   15063           44 :       op1 = expand_normal (arg1);
   15064           44 :       op2 = expand_normal (arg2);
   15065              : 
   15066           44 :       if (GET_MODE (op1) != Pmode)
   15067            0 :         op1 = convert_to_mode (Pmode, op1, 1);
   15068              : 
   15069           44 :       if (!address_operand (op2, VOIDmode))
   15070              :         {
   15071           16 :           op2 = convert_memory_address (Pmode, op2);
   15072           16 :           op2 = copy_addr_to_reg (op2);
   15073              :         }
   15074           44 :       op2 = gen_rtx_MEM (BLKmode, op2);
   15075              : 
   15076          440 :       for (i = 0; i < 8; i++)
   15077              :         {
   15078          352 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15079              : 
   15080          352 :           op = gen_rtx_MEM (V2DImode,
   15081          352 :                             plus_constant (Pmode, op1, (i * 16)));
   15082              : 
   15083          352 :           emit_move_insn (xmm_regs[i], op);
   15084              :         }
   15085              : 
   15086           44 :       emit_insn (GEN_FCN (icode) (op2));
   15087              : 
   15088           44 :       if (target == 0)
   15089            0 :         target = gen_reg_rtx (QImode);
   15090              : 
   15091           44 :       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   15092           44 :       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   15093           44 :       ok_label = gen_label_rtx ();
   15094           44 :       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   15095              :                                true, ok_label);
   15096           44 :       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   15097              : 
   15098          440 :       for (i = 0; i < 8; i++)
   15099          352 :         emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
   15100              : 
   15101           44 :       emit_label (ok_label);
   15102           44 :       emit_insn (gen_rtx_SET (target, pat));
   15103              : 
   15104           44 :       if (GET_MODE (op0) != Pmode)
   15105            0 :         op0 = convert_to_mode (Pmode, op0, 1);
   15106              : 
   15107          396 :       for (i = 0; i < 8; i++)
   15108              :         {
   15109          352 :           op = gen_rtx_MEM (V2DImode,
   15110          352 :                             plus_constant (Pmode, op0, (i * 16)));
   15111          352 :           emit_move_insn (op, xmm_regs[i]);
   15112              :         }
   15113              : 
   15114              :       return target;
   15115              : 
   15116           13 :     case IX86_BUILTIN_ENCODEKEY128U32:
   15117           13 :       {
   15118           13 :         rtx op, xmm_regs[7];
   15119              : 
   15120           13 :         arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   15121           13 :         arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
   15122           13 :         arg2 = CALL_EXPR_ARG (exp, 2); // void *h
   15123              : 
   15124           13 :         op0 = expand_normal (arg0);
   15125           13 :         op1 = expand_normal (arg1);
   15126           13 :         op2 = expand_normal (arg2);
   15127              : 
   15128           13 :         if (!REG_P (op0))
   15129            7 :           op0 = copy_to_mode_reg (SImode, op0);
   15130              : 
   15131           13 :         if (GET_MODE (op2) != Pmode)
   15132            1 :           op2 = convert_to_mode (Pmode, op2, 1);
   15133              : 
   15134           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   15135           13 :         emit_move_insn (op, op1);
   15136              : 
   15137           65 :         for (i = 0; i < 3; i++)
   15138           39 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15139              : 
   15140           13 :         if (target == 0 || !register_operand (target, SImode))
   15141            2 :           target = gen_reg_rtx (SImode);
   15142              : 
   15143           13 :         emit_insn (gen_encodekey128u32 (target, op0));
   15144              : 
   15145           65 :         for (i = 0; i < 3; i++)
   15146              :           {
   15147           39 :             op = gen_rtx_MEM (V2DImode,
   15148           39 :                               plus_constant (Pmode, op2, (i * 16)));
   15149           39 :             emit_move_insn (op, xmm_regs[i]);
   15150              :           }
   15151              : 
   15152           13 :         return target;
   15153              :       }
   15154           13 :     case IX86_BUILTIN_ENCODEKEY256U32:
   15155           13 :       {
   15156           13 :         rtx op, xmm_regs[7];
   15157              : 
   15158           13 :         arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   15159           13 :         arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
   15160           13 :         arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
   15161           13 :         arg3 = CALL_EXPR_ARG (exp, 3); // void *h
   15162              : 
   15163           13 :         op0 = expand_normal (arg0);
   15164           13 :         op1 = expand_normal (arg1);
   15165           13 :         op2 = expand_normal (arg2);
   15166           13 :         op3 = expand_normal (arg3);
   15167              : 
   15168           13 :         if (!REG_P (op0))
   15169            7 :           op0 = copy_to_mode_reg (SImode, op0);
   15170              : 
   15171           13 :         if (GET_MODE (op3) != Pmode)
   15172            1 :           op3 = convert_to_mode (Pmode, op3, 1);
   15173              : 
   15174              :         /* Force to use xmm0, xmm1 for keylow, keyhi*/
   15175           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   15176           13 :         emit_move_insn (op, op1);
   15177           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
   15178           13 :         emit_move_insn (op, op2);
   15179              : 
   15180           78 :         for (i = 0; i < 4; i++)
   15181           52 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15182              : 
   15183           13 :         if (target == 0 || !register_operand (target, SImode))
   15184            2 :           target = gen_reg_rtx (SImode);
   15185              : 
   15186           13 :         emit_insn (gen_encodekey256u32 (target, op0));
   15187              : 
   15188           78 :         for (i = 0; i < 4; i++)
   15189              :           {
   15190           52 :             op = gen_rtx_MEM (V2DImode,
   15191           52 :                               plus_constant (Pmode, op3, (i * 16)));
   15192           52 :             emit_move_insn (op, xmm_regs[i]);
   15193              :           }
   15194              : 
   15195           13 :         return target;
   15196              :       }
   15197              : 
   15198           48 :     case IX86_BUILTIN_PREFETCH:
   15199           48 :       {
   15200           48 :         arg0 = CALL_EXPR_ARG (exp, 0); // const void *
   15201           48 :         arg1 = CALL_EXPR_ARG (exp, 1); // const int
   15202           48 :         arg2 = CALL_EXPR_ARG (exp, 2); // const int
   15203           48 :         arg3 = CALL_EXPR_ARG (exp, 3); // const int
   15204              : 
   15205           48 :         op0 = expand_normal (arg0);
   15206           48 :         op1 = expand_normal (arg1);
   15207           48 :         op2 = expand_normal (arg2);
   15208           48 :         op3 = expand_normal (arg3);
   15209              : 
   15210           48 :         if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
   15211              :           {
   15212            0 :             error ("second, third and fourth argument must be a const");
   15213            0 :             return const0_rtx;
   15214              :           }
   15215              : 
   15216           48 :         if (!IN_RANGE (INTVAL (op1), 0, 2))
   15217              :           {
   15218            1 :             warning (0, "invalid second argument to"
   15219              :                      " %<__builtin_ia32_prefetch%>; using zero");
   15220            1 :             op1 = const0_rtx;
   15221              :           }
   15222              : 
   15223           48 :         if (INTVAL (op3) == 1)
   15224              :           {
   15225            4 :             if (!IN_RANGE (INTVAL (op2), 2, 3))
   15226              :               {
   15227            1 :                 error ("invalid third argument");
   15228            1 :                 return const0_rtx;
   15229              :               }
   15230              : 
   15231            3 :             if (TARGET_64BIT && TARGET_PREFETCHI
   15232            6 :                 && local_func_symbolic_operand (op0, GET_MODE (op0)))
   15233            2 :               emit_insn (gen_prefetchi (op0, op2));
   15234              :             else
   15235              :               {
   15236            1 :                 warning (0, "instruction prefetch applies when in 64-bit mode"
   15237              :                             " with RIP-relative addressing and"
   15238              :                             " option %<-mprefetchi%>;"
   15239              :                             " they stay NOPs otherwise");
   15240            1 :                 emit_insn (gen_nop ());
   15241              :               }
   15242              :           }
   15243              :         else
   15244              :           {
   15245           44 :             if (INTVAL (op3) != 0)
   15246            1 :               warning (0, "invalid forth argument to"
   15247              :                           " %<__builtin_ia32_prefetch%>; using zero");
   15248              : 
   15249           44 :             if (!address_operand (op0, VOIDmode))
   15250              :               {
   15251           10 :                 op0 = convert_memory_address (Pmode, op0);
   15252           10 :                 op0 = copy_addr_to_reg (op0);
   15253              :               }
   15254              : 
   15255           44 :             if (!IN_RANGE (INTVAL (op2), 0, 3))
   15256              :               {
   15257            1 :                 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
   15258            1 :                 op2 = const0_rtx;
   15259              :               }
   15260              : 
   15261           44 :             if (TARGET_3DNOW
   15262           26 :                 || TARGET_PREFETCH_SSE
   15263            0 :                 || TARGET_PRFCHW
   15264            0 :                 || TARGET_MOVRS)
   15265           44 :               emit_insn (gen_prefetch (op0, op1, op2));
   15266            0 :             else if (!MEM_P (op0) && side_effects_p (op0))
   15267              :               /* Don't do anything with direct references to volatile memory,
   15268              :                  but generate code to handle other side effects.  */
   15269            0 :               emit_insn (op0);
   15270              :           }
   15271              : 
   15272              :         return 0;
   15273              :       }
   15274              : 
   15275           21 :     case IX86_BUILTIN_PREFETCHI:
   15276           21 :       {
   15277           21 :         arg0 = CALL_EXPR_ARG (exp, 0); // const void *
   15278           21 :         arg1 = CALL_EXPR_ARG (exp, 1); // const int
   15279              : 
   15280           21 :         op0 = expand_normal (arg0);
   15281           21 :         op1 = expand_normal (arg1);
   15282              : 
   15283           21 :         if (!CONST_INT_P (op1))
   15284              :           {
   15285            0 :             error ("second argument must be a const");
   15286            0 :             return const0_rtx;
   15287              :           }
   15288              : 
   15289              :         /* GOT/PLT_PIC should not be available for instruction prefetch.
   15290              :            It must be real instruction address.  */
   15291           21 :         if (TARGET_64BIT
   15292           21 :             && local_func_symbolic_operand (op0, GET_MODE (op0)))
   15293            4 :           emit_insn (gen_prefetchi (op0, op1));
   15294              :         else
   15295              :           {
   15296              :             /* Ignore the hint.  */
   15297           17 :             warning (0, "instruction prefetch applies when in 64-bit mode"
   15298              :                         " with RIP-relative addressing and"
   15299              :                         " option %<-mprefetchi%>;"
   15300              :                         " they stay NOPs otherwise");
   15301           17 :             emit_insn (gen_nop ());
   15302              :           }
   15303              : 
   15304              :         return 0;
   15305              :       }
   15306              : 
   15307           53 :     case IX86_BUILTIN_URDMSR:
   15308           53 :     case IX86_BUILTIN_UWRMSR:
   15309           53 :       {
   15310           53 :         arg0 = CALL_EXPR_ARG (exp, 0);
   15311           53 :         op0 = expand_normal (arg0);
   15312              : 
   15313           53 :         if (CONST_INT_P (op0))
   15314              :           {
   15315           12 :             unsigned HOST_WIDE_INT val = UINTVAL (op0);
   15316           12 :             if (val > 0xffffffff)
   15317            2 :               op0 = force_reg (DImode, op0);
   15318              :           }
   15319              :         else
   15320           41 :           op0 = force_reg (DImode, op0);
   15321              : 
   15322           53 :         if (fcode == IX86_BUILTIN_UWRMSR)
   15323              :           {
   15324           26 :             arg1 = CALL_EXPR_ARG (exp, 1);
   15325           26 :             op1 = expand_normal (arg1);
   15326           26 :             op1 = force_reg (DImode, op1);
   15327           26 :             icode = CODE_FOR_uwrmsr;
   15328           26 :             target = 0;
   15329              :           }
   15330              :         else
   15331              :           {
   15332           27 :             if (target == 0 || !register_operand (target, DImode))
   15333            1 :               target = gen_reg_rtx (DImode);
   15334              :             icode = CODE_FOR_urdmsr;
   15335              :             op1 = op0;
   15336              :             op0 = target;
   15337              :           }
   15338           53 :         emit_insn (GEN_FCN (icode) (op0, op1));
   15339           53 :         return target;
   15340              :       }
   15341              : 
   15342          229 :     case IX86_BUILTIN_VEC_INIT_V2SI:
   15343          229 :     case IX86_BUILTIN_VEC_INIT_V4HI:
   15344          229 :     case IX86_BUILTIN_VEC_INIT_V8QI:
   15345          229 :       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
   15346              : 
   15347          400 :     case IX86_BUILTIN_VEC_EXT_V2DF:
   15348          400 :     case IX86_BUILTIN_VEC_EXT_V2DI:
   15349          400 :     case IX86_BUILTIN_VEC_EXT_V4SF:
   15350          400 :     case IX86_BUILTIN_VEC_EXT_V4SI:
   15351          400 :     case IX86_BUILTIN_VEC_EXT_V8HI:
   15352          400 :     case IX86_BUILTIN_VEC_EXT_V2SI:
   15353          400 :     case IX86_BUILTIN_VEC_EXT_V4HI:
   15354          400 :     case IX86_BUILTIN_VEC_EXT_V16QI:
   15355          400 :       return ix86_expand_vec_ext_builtin (exp, target);
   15356              : 
   15357          204 :     case IX86_BUILTIN_VEC_SET_V2DI:
   15358          204 :     case IX86_BUILTIN_VEC_SET_V4SF:
   15359          204 :     case IX86_BUILTIN_VEC_SET_V4SI:
   15360          204 :     case IX86_BUILTIN_VEC_SET_V8HI:
   15361          204 :     case IX86_BUILTIN_VEC_SET_V4HI:
   15362          204 :     case IX86_BUILTIN_VEC_SET_V16QI:
   15363          204 :       return ix86_expand_vec_set_builtin (exp);
   15364              : 
   15365            0 :     case IX86_BUILTIN_NANQ:
   15366            0 :     case IX86_BUILTIN_NANSQ:
   15367            0 :       return expand_call (exp, target, ignore);
   15368              : 
   15369           18 :     case IX86_BUILTIN_RDPID:
   15370              : 
   15371           18 :       op0 = gen_reg_rtx (word_mode);
   15372              : 
   15373           18 :       if (TARGET_64BIT)
   15374              :         {
   15375           18 :           insn = gen_rdpid_rex64 (op0);
   15376           18 :           op0 = convert_to_mode (SImode, op0, 1);
   15377              :         }
   15378              :       else
   15379            0 :         insn = gen_rdpid (op0);
   15380              : 
   15381           18 :       emit_insn (insn);
   15382              : 
   15383           18 :       if (target == 0
   15384           18 :           || !register_operand (target, SImode))
   15385            0 :         target = gen_reg_rtx (SImode);
   15386              : 
   15387           18 :       emit_move_insn (target, op0);
   15388           18 :       return target;
   15389              : 
   15390           76 :     case IX86_BUILTIN_2INTERSECTD512:
   15391           76 :     case IX86_BUILTIN_2INTERSECTQ512:
   15392           76 :     case IX86_BUILTIN_2INTERSECTD256:
   15393           76 :     case IX86_BUILTIN_2INTERSECTQ256:
   15394           76 :     case IX86_BUILTIN_2INTERSECTD128:
   15395           76 :     case IX86_BUILTIN_2INTERSECTQ128:
   15396           76 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15397           76 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15398           76 :       arg2 = CALL_EXPR_ARG (exp, 2);
   15399           76 :       arg3 = CALL_EXPR_ARG (exp, 3);
   15400           76 :       op0 = expand_normal (arg0);
   15401           76 :       op1 = expand_normal (arg1);
   15402           76 :       op2 = expand_normal (arg2);
   15403           76 :       op3 = expand_normal (arg3);
   15404              : 
   15405           76 :       if (!address_operand (op0, VOIDmode))
   15406              :         {
   15407           26 :           op0 = convert_memory_address (Pmode, op0);
   15408           26 :           op0 = copy_addr_to_reg (op0);
   15409              :         }
   15410           76 :       if (!address_operand (op1, VOIDmode))
   15411              :         {
   15412           26 :           op1 = convert_memory_address (Pmode, op1);
   15413           26 :           op1 = copy_addr_to_reg (op1);
   15414              :         }
   15415              : 
   15416           76 :       switch (fcode)
   15417              :         {
   15418              :         case IX86_BUILTIN_2INTERSECTD512:
   15419              :           mode4 = P2HImode;
   15420              :           icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
   15421              :           break;
   15422              :         case IX86_BUILTIN_2INTERSECTQ512:
   15423              :           mode4 = P2QImode;
   15424              :           icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
   15425              :           break;
   15426              :         case IX86_BUILTIN_2INTERSECTD256:
   15427              :           mode4 = P2QImode;
   15428              :           icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
   15429              :           break;
   15430              :         case IX86_BUILTIN_2INTERSECTQ256:
   15431              :           mode4 = P2QImode;
   15432              :           icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
   15433              :           break;
   15434              :         case IX86_BUILTIN_2INTERSECTD128:
   15435              :           mode4 = P2QImode;
   15436              :           icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
   15437              :           break;
   15438              :         case IX86_BUILTIN_2INTERSECTQ128:
   15439              :           mode4 = P2QImode;
   15440              :           icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
   15441              :           break;
   15442            0 :         default:
   15443            0 :           gcc_unreachable ();
   15444              :         }
   15445              : 
   15446           76 :       mode2 = insn_data[icode].operand[1].mode;
   15447           76 :       mode3 = insn_data[icode].operand[2].mode;
   15448           76 :       if (!insn_data[icode].operand[1].predicate (op2, mode2))
   15449           26 :         op2 = copy_to_mode_reg (mode2, op2);
   15450           76 :       if (!insn_data[icode].operand[2].predicate (op3, mode3))
   15451            6 :         op3 = copy_to_mode_reg (mode3, op3);
   15452              : 
   15453           76 :       op4 = gen_reg_rtx (mode4);
   15454           76 :       emit_insn (GEN_FCN (icode) (op4, op2, op3));
   15455           76 :       mode0 = mode4 == P2HImode ? HImode : QImode;
   15456           76 :       emit_move_insn (gen_rtx_MEM (mode0, op0),
   15457           76 :                       gen_lowpart (mode0, op4));
   15458           76 :       emit_move_insn (gen_rtx_MEM (mode0, op1),
   15459              :                       gen_highpart (mode0, op4));
   15460              : 
   15461           76 :       return 0;
   15462              : 
   15463          102 :     case IX86_BUILTIN_RDPMC:
   15464          102 :     case IX86_BUILTIN_RDTSC:
   15465          102 :     case IX86_BUILTIN_RDTSCP:
   15466          102 :     case IX86_BUILTIN_XGETBV:
   15467              : 
   15468          102 :       op0 = gen_reg_rtx (DImode);
   15469          102 :       op1 = gen_reg_rtx (DImode);
   15470              : 
   15471          102 :       if (fcode == IX86_BUILTIN_RDPMC)
   15472              :         {
   15473           22 :           arg0 = CALL_EXPR_ARG (exp, 0);
   15474           22 :           op2 = expand_normal (arg0);
   15475           22 :           if (!register_operand (op2, SImode))
   15476           11 :             op2 = copy_to_mode_reg (SImode, op2);
   15477              : 
   15478           22 :           insn = (TARGET_64BIT
   15479           22 :                   ? gen_rdpmc_rex64 (op0, op1, op2)
   15480            0 :                   : gen_rdpmc (op0, op2));
   15481           22 :           emit_insn (insn);
   15482              :         }
   15483           80 :       else if (fcode == IX86_BUILTIN_XGETBV)
   15484              :         {
   15485           22 :           arg0 = CALL_EXPR_ARG (exp, 0);
   15486           22 :           op2 = expand_normal (arg0);
   15487           22 :           if (!register_operand (op2, SImode))
   15488            1 :             op2 = copy_to_mode_reg (SImode, op2);
   15489              : 
   15490           22 :           insn = (TARGET_64BIT
   15491           22 :                   ? gen_xgetbv_rex64 (op0, op1, op2)
   15492            0 :                   : gen_xgetbv (op0, op2));
   15493           22 :           emit_insn (insn);
   15494              :         }
   15495           58 :       else if (fcode == IX86_BUILTIN_RDTSC)
   15496              :         {
   15497           36 :           insn = (TARGET_64BIT
   15498           36 :                   ? gen_rdtsc_rex64 (op0, op1)
   15499            2 :                   : gen_rdtsc (op0));
   15500           36 :           emit_insn (insn);
   15501              :         }
   15502              :       else
   15503              :         {
   15504           22 :           op2 = gen_reg_rtx (SImode);
   15505              : 
   15506           22 :           insn = (TARGET_64BIT
   15507           22 :                   ? gen_rdtscp_rex64 (op0, op1, op2)
   15508            0 :                   : gen_rdtscp (op0, op2));
   15509           22 :           emit_insn (insn);
   15510              : 
   15511           22 :           arg0 = CALL_EXPR_ARG (exp, 0);
   15512           22 :           op4 = expand_normal (arg0);
   15513           22 :           if (!address_operand (op4, VOIDmode))
   15514              :             {
   15515           10 :               op4 = convert_memory_address (Pmode, op4);
   15516           10 :               op4 = copy_addr_to_reg (op4);
   15517              :             }
   15518           22 :           emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
   15519              :         }
   15520              : 
   15521          102 :       if (target == 0
   15522          102 :           || !register_operand (target, DImode))
   15523           10 :         target = gen_reg_rtx (DImode);
   15524              : 
   15525          102 :       if (TARGET_64BIT)
   15526              :         {
   15527          100 :           op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
   15528              :                                      op1, 1, OPTAB_DIRECT);
   15529          100 :           op0 = expand_simple_binop (DImode, IOR, op0, op1,
   15530              :                                      op0, 1, OPTAB_DIRECT);
   15531              :         }
   15532              : 
   15533          102 :       emit_move_insn (target, op0);
   15534          102 :       return target;
   15535              : 
   15536           61 :     case IX86_BUILTIN_ENQCMD:
   15537           61 :     case IX86_BUILTIN_ENQCMDS:
   15538           61 :     case IX86_BUILTIN_MOVDIR64B:
   15539              : 
   15540           61 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15541           61 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15542           61 :       op0 = expand_normal (arg0);
   15543           61 :       op1 = expand_normal (arg1);
   15544              : 
   15545           61 :       op0 = ix86_zero_extend_to_Pmode (op0);
   15546           61 :       if (!address_operand (op1, VOIDmode))
   15547              :       {
   15548           28 :         op1 = convert_memory_address (Pmode, op1);
   15549           28 :         op1 = copy_addr_to_reg (op1);
   15550              :       }
   15551           61 :       op1 = gen_rtx_MEM (XImode, op1);
   15552              : 
   15553           61 :       if (fcode == IX86_BUILTIN_MOVDIR64B)
   15554              :         {
   15555           24 :           emit_insn (gen_movdir64b (Pmode, op0, op1));
   15556           23 :           return 0;
   15557              :         }
   15558              :       else
   15559              :         {
   15560           38 :           if (target == 0
   15561           38 :               || !register_operand (target, SImode))
   15562            0 :             target = gen_reg_rtx (SImode);
   15563              : 
   15564           38 :           emit_move_insn (target, const0_rtx);
   15565           38 :           target = gen_rtx_SUBREG (QImode, target, 0);
   15566              : 
   15567           19 :           int unspecv = (fcode == IX86_BUILTIN_ENQCMD
   15568           38 :                          ? UNSPECV_ENQCMD
   15569              :                          : UNSPECV_ENQCMDS);
   15570           38 :           icode = code_for_enqcmd (unspecv, Pmode);
   15571           38 :           emit_insn (GEN_FCN (icode) (op0, op1));
   15572              : 
   15573           38 :           emit_insn
   15574           38 :             (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   15575              :                           gen_rtx_fmt_ee (EQ, QImode,
   15576              :                                           gen_rtx_REG (CCZmode, FLAGS_REG),
   15577              :                                           const0_rtx)));
   15578           38 :           return SUBREG_REG (target);
   15579              :         }
   15580              : 
   15581        14775 :     case IX86_BUILTIN_FXSAVE:
   15582        14775 :     case IX86_BUILTIN_FXRSTOR:
   15583        14775 :     case IX86_BUILTIN_FXSAVE64:
   15584        14775 :     case IX86_BUILTIN_FXRSTOR64:
   15585        14775 :     case IX86_BUILTIN_FNSTENV:
   15586        14775 :     case IX86_BUILTIN_FLDENV:
   15587        14775 :       mode0 = BLKmode;
   15588        14775 :       switch (fcode)
   15589              :         {
   15590              :         case IX86_BUILTIN_FXSAVE:
   15591              :           icode = CODE_FOR_fxsave;
   15592              :           break;
   15593           19 :         case IX86_BUILTIN_FXRSTOR:
   15594           19 :           icode = CODE_FOR_fxrstor;
   15595           19 :           break;
   15596           23 :         case IX86_BUILTIN_FXSAVE64:
   15597           23 :           icode = CODE_FOR_fxsave64;
   15598           23 :           break;
   15599           21 :         case IX86_BUILTIN_FXRSTOR64:
   15600           21 :           icode = CODE_FOR_fxrstor64;
   15601           21 :           break;
   15602         7257 :         case IX86_BUILTIN_FNSTENV:
   15603         7257 :           icode = CODE_FOR_fnstenv;
   15604         7257 :           break;
   15605         7435 :         case IX86_BUILTIN_FLDENV:
   15606         7435 :           icode = CODE_FOR_fldenv;
   15607         7435 :           break;
   15608            0 :         default:
   15609            0 :           gcc_unreachable ();
   15610              :         }
   15611              : 
   15612        14775 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15613        14775 :       op0 = expand_normal (arg0);
   15614              : 
   15615        14775 :       if (!address_operand (op0, VOIDmode))
   15616              :         {
   15617           36 :           op0 = convert_memory_address (Pmode, op0);
   15618           36 :           op0 = copy_addr_to_reg (op0);
   15619              :         }
   15620        14775 :       op0 = gen_rtx_MEM (mode0, op0);
   15621              : 
   15622        14775 :       pat = GEN_FCN (icode) (op0);
   15623        14775 :       if (pat)
   15624        14775 :         emit_insn (pat);
   15625              :       return 0;
   15626              : 
   15627           21 :     case IX86_BUILTIN_XSETBV:
   15628           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15629           21 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15630           21 :       op0 = expand_normal (arg0);
   15631           21 :       op1 = expand_normal (arg1);
   15632              : 
   15633           21 :       if (!REG_P (op0))
   15634            1 :         op0 = copy_to_mode_reg (SImode, op0);
   15635              : 
   15636           21 :       op1 = force_reg (DImode, op1);
   15637              : 
   15638           21 :       if (TARGET_64BIT)
   15639              :         {
   15640           21 :           op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   15641              :                                      NULL, 1, OPTAB_DIRECT);
   15642              : 
   15643           21 :           icode = CODE_FOR_xsetbv_rex64;
   15644              : 
   15645           21 :           op2 = gen_lowpart (SImode, op2);
   15646           21 :           op1 = gen_lowpart (SImode, op1);
   15647           21 :           pat = GEN_FCN (icode) (op0, op1, op2);
   15648              :         }
   15649              :       else
   15650              :         {
   15651            0 :           icode = CODE_FOR_xsetbv;
   15652              : 
   15653            0 :           pat = GEN_FCN (icode) (op0, op1);
   15654              :         }
   15655           21 :       if (pat)
   15656           21 :         emit_insn (pat);
   15657              :       return 0;
   15658              : 
   15659          232 :     case IX86_BUILTIN_XSAVE:
   15660          232 :     case IX86_BUILTIN_XRSTOR:
   15661          232 :     case IX86_BUILTIN_XSAVE64:
   15662          232 :     case IX86_BUILTIN_XRSTOR64:
   15663          232 :     case IX86_BUILTIN_XSAVEOPT:
   15664          232 :     case IX86_BUILTIN_XSAVEOPT64:
   15665          232 :     case IX86_BUILTIN_XSAVES:
   15666          232 :     case IX86_BUILTIN_XRSTORS:
   15667          232 :     case IX86_BUILTIN_XSAVES64:
   15668          232 :     case IX86_BUILTIN_XRSTORS64:
   15669          232 :     case IX86_BUILTIN_XSAVEC:
   15670          232 :     case IX86_BUILTIN_XSAVEC64:
   15671          232 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15672          232 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15673          232 :       op0 = expand_normal (arg0);
   15674          232 :       op1 = expand_normal (arg1);
   15675              : 
   15676          232 :       if (!address_operand (op0, VOIDmode))
   15677              :         {
   15678          108 :           op0 = convert_memory_address (Pmode, op0);
   15679          108 :           op0 = copy_addr_to_reg (op0);
   15680              :         }
   15681          232 :       op0 = gen_rtx_MEM (BLKmode, op0);
   15682              : 
   15683          232 :       op1 = force_reg (DImode, op1);
   15684              : 
   15685          232 :       if (TARGET_64BIT)
   15686              :         {
   15687          232 :           op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   15688              :                                      NULL, 1, OPTAB_DIRECT);
   15689          232 :           switch (fcode)
   15690              :             {
   15691              :             case IX86_BUILTIN_XSAVE:
   15692              :               icode = CODE_FOR_xsave_rex64;
   15693              :               break;
   15694           19 :             case IX86_BUILTIN_XRSTOR:
   15695           19 :               icode = CODE_FOR_xrstor_rex64;
   15696           19 :               break;
   15697           21 :             case IX86_BUILTIN_XSAVE64:
   15698           21 :               icode = CODE_FOR_xsave64;
   15699           21 :               break;
   15700           21 :             case IX86_BUILTIN_XRSTOR64:
   15701           21 :               icode = CODE_FOR_xrstor64;
   15702           21 :               break;
   15703           19 :             case IX86_BUILTIN_XSAVEOPT:
   15704           19 :               icode = CODE_FOR_xsaveopt_rex64;
   15705           19 :               break;
   15706           19 :             case IX86_BUILTIN_XSAVEOPT64:
   15707           19 :               icode = CODE_FOR_xsaveopt64;
   15708           19 :               break;
   15709           19 :             case IX86_BUILTIN_XSAVES:
   15710           19 :               icode = CODE_FOR_xsaves_rex64;
   15711           19 :               break;
   15712           19 :             case IX86_BUILTIN_XRSTORS:
   15713           19 :               icode = CODE_FOR_xrstors_rex64;
   15714           19 :               break;
   15715           19 :             case IX86_BUILTIN_XSAVES64:
   15716           19 :               icode = CODE_FOR_xsaves64;
   15717           19 :               break;
   15718           19 :             case IX86_BUILTIN_XRSTORS64:
   15719           19 :               icode = CODE_FOR_xrstors64;
   15720           19 :               break;
   15721           19 :             case IX86_BUILTIN_XSAVEC:
   15722           19 :               icode = CODE_FOR_xsavec_rex64;
   15723           19 :               break;
   15724           19 :             case IX86_BUILTIN_XSAVEC64:
   15725           19 :               icode = CODE_FOR_xsavec64;
   15726           19 :               break;
   15727            0 :             default:
   15728            0 :               gcc_unreachable ();
   15729              :             }
   15730              : 
   15731          232 :           op2 = gen_lowpart (SImode, op2);
   15732          232 :           op1 = gen_lowpart (SImode, op1);
   15733          232 :           pat = GEN_FCN (icode) (op0, op1, op2);
   15734              :         }
   15735              :       else
   15736              :         {
   15737            0 :           switch (fcode)
   15738              :             {
   15739              :             case IX86_BUILTIN_XSAVE:
   15740              :               icode = CODE_FOR_xsave;
   15741              :               break;
   15742              :             case IX86_BUILTIN_XRSTOR:
   15743              :               icode = CODE_FOR_xrstor;
   15744              :               break;
   15745              :             case IX86_BUILTIN_XSAVEOPT:
   15746              :               icode = CODE_FOR_xsaveopt;
   15747              :               break;
   15748              :             case IX86_BUILTIN_XSAVES:
   15749              :               icode = CODE_FOR_xsaves;
   15750              :               break;
   15751              :             case IX86_BUILTIN_XRSTORS:
   15752              :               icode = CODE_FOR_xrstors;
   15753              :               break;
   15754              :             case IX86_BUILTIN_XSAVEC:
   15755              :               icode = CODE_FOR_xsavec;
   15756              :               break;
   15757            0 :             default:
   15758            0 :               gcc_unreachable ();
   15759              :             }
   15760            0 :           pat = GEN_FCN (icode) (op0, op1);
   15761              :         }
   15762              : 
   15763          232 :       if (pat)
   15764          232 :         emit_insn (pat);
   15765              :       return 0;
   15766              : 
   15767          144 :     case IX86_BUILTIN_LDTILECFG:
   15768          144 :     case IX86_BUILTIN_STTILECFG:
   15769          144 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15770          144 :       op0 = expand_normal (arg0);
   15771              : 
   15772          144 :       if (!address_operand (op0, VOIDmode))
   15773              :         {
   15774            8 :           op0 = convert_memory_address (Pmode, op0);
   15775            8 :           op0 = copy_addr_to_reg (op0);
   15776              :         }
   15777          144 :       op0 = gen_rtx_MEM (BLKmode, op0);
   15778          144 :       if (fcode == IX86_BUILTIN_LDTILECFG)
   15779              :         icode = CODE_FOR_ldtilecfg;
   15780              :       else
   15781           93 :         icode = CODE_FOR_sttilecfg;
   15782          144 :       pat = GEN_FCN (icode) (op0);
   15783          144 :       emit_insn (pat);
   15784          144 :       return 0;
   15785              : 
   15786           18 :     case IX86_BUILTIN_LLWPCB:
   15787           18 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15788           18 :       op0 = expand_normal (arg0);
   15789              : 
   15790           18 :       if (!register_operand (op0, Pmode))
   15791            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   15792           18 :       emit_insn (gen_lwp_llwpcb (Pmode, op0));
   15793           18 :       return 0;
   15794              : 
   15795           18 :     case IX86_BUILTIN_SLWPCB:
   15796           18 :       if (!target
   15797           18 :           || !register_operand (target, Pmode))
   15798            0 :         target = gen_reg_rtx (Pmode);
   15799           18 :       emit_insn (gen_lwp_slwpcb (Pmode, target));
   15800           18 :       return target;
   15801              : 
   15802           51 :     case IX86_BUILTIN_LWPVAL32:
   15803           51 :     case IX86_BUILTIN_LWPVAL64:
   15804           51 :     case IX86_BUILTIN_LWPINS32:
   15805           51 :     case IX86_BUILTIN_LWPINS64:
   15806           51 :       mode = ((fcode == IX86_BUILTIN_LWPVAL32
   15807           51 :                || fcode == IX86_BUILTIN_LWPINS32)
   15808           51 :               ? SImode : DImode);
   15809              : 
   15810           51 :       if (fcode == IX86_BUILTIN_LWPVAL32
   15811           51 :           || fcode == IX86_BUILTIN_LWPVAL64)
   15812           26 :         icode = code_for_lwp_lwpval (mode);
   15813              :       else
   15814           25 :         icode = code_for_lwp_lwpins (mode);
   15815              : 
   15816           51 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15817           51 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15818           51 :       arg2 = CALL_EXPR_ARG (exp, 2);
   15819           51 :       op0 = expand_normal (arg0);
   15820           51 :       op1 = expand_normal (arg1);
   15821           51 :       op2 = expand_normal (arg2);
   15822           51 :       mode0 = insn_data[icode].operand[0].mode;
   15823              : 
   15824           51 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   15825           13 :         op0 = copy_to_mode_reg (mode0, op0);
   15826           51 :       if (!insn_data[icode].operand[1].predicate (op1, SImode))
   15827            0 :         op1 = copy_to_mode_reg (SImode, op1);
   15828              : 
   15829           51 :       if (!CONST_INT_P (op2))
   15830              :         {
   15831            0 :           error ("the last argument must be a 32-bit immediate");
   15832            0 :           return const0_rtx;
   15833              :         }
   15834              : 
   15835           51 :       emit_insn (GEN_FCN (icode) (op0, op1, op2));
   15836              : 
   15837           51 :       if (fcode == IX86_BUILTIN_LWPINS32
   15838           51 :           || fcode == IX86_BUILTIN_LWPINS64)
   15839              :         {
   15840           25 :           if (target == 0
   15841           25 :               || !nonimmediate_operand (target, QImode))
   15842            0 :             target = gen_reg_rtx (QImode);
   15843              : 
   15844           25 :           pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   15845              :                             const0_rtx);
   15846           25 :           emit_insn (gen_rtx_SET (target, pat));
   15847              : 
   15848           25 :           return target;
   15849              :         }
   15850              :       else
   15851              :         return 0;
   15852              : 
   15853           18 :     case IX86_BUILTIN_BEXTRI32:
   15854           18 :     case IX86_BUILTIN_BEXTRI64:
   15855           18 :       mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
   15856              : 
   15857           18 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15858           18 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15859           18 :       op0 = expand_normal (arg0);
   15860           18 :       op1 = expand_normal (arg1);
   15861              : 
   15862           18 :       if (!CONST_INT_P (op1))
   15863              :         {
   15864            0 :           error ("last argument must be an immediate");
   15865            0 :           return const0_rtx;
   15866              :         }
   15867              :       else
   15868              :         {
   15869           18 :           unsigned char lsb_index = UINTVAL (op1);
   15870           18 :           unsigned char length = UINTVAL (op1) >> 8;
   15871              : 
   15872           18 :           unsigned char bitsize = GET_MODE_BITSIZE (mode);
   15873              : 
   15874           18 :           icode = code_for_tbm_bextri (mode);
   15875              : 
   15876           18 :           mode1 = insn_data[icode].operand[1].mode;
   15877           18 :           if (!insn_data[icode].operand[1].predicate (op0, mode1))
   15878           12 :             op0 = copy_to_mode_reg (mode1, op0);
   15879              : 
   15880           18 :           mode0 = insn_data[icode].operand[0].mode;
   15881           18 :           if (target == 0
   15882           18 :               || !register_operand (target, mode0))
   15883            0 :             target = gen_reg_rtx (mode0);
   15884              : 
   15885           18 :           if (length == 0 || lsb_index >= bitsize)
   15886              :             {
   15887            8 :               emit_move_insn (target, const0_rtx);
   15888            8 :               return target;
   15889              :             }
   15890              : 
   15891           10 :           if (length + lsb_index > bitsize)
   15892            5 :             length = bitsize - lsb_index;
   15893              : 
   15894           10 :           op1 = GEN_INT (length);
   15895           10 :           op2 = GEN_INT (lsb_index);
   15896              : 
   15897           10 :           emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
   15898           10 :           return target;
   15899              :         }
   15900              : 
   15901           21 :     case IX86_BUILTIN_RDRAND16_STEP:
   15902           21 :       mode = HImode;
   15903           21 :       goto rdrand_step;
   15904              : 
   15905           42 :     case IX86_BUILTIN_RDRAND32_STEP:
   15906           42 :       mode = SImode;
   15907           42 :       goto rdrand_step;
   15908              : 
   15909              :     case IX86_BUILTIN_RDRAND64_STEP:
   15910              :       mode = DImode;
   15911              : 
   15912           83 : rdrand_step:
   15913           83 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15914           83 :       op1 = expand_normal (arg0);
   15915           83 :       if (!address_operand (op1, VOIDmode))
   15916              :         {
   15917           29 :           op1 = convert_memory_address (Pmode, op1);
   15918           29 :           op1 = copy_addr_to_reg (op1);
   15919              :         }
   15920              : 
   15921           83 :       op0 = gen_reg_rtx (mode);
   15922           83 :       emit_insn (gen_rdrand (mode, op0));
   15923              : 
   15924           83 :       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   15925              : 
   15926           83 :       op1 = force_reg (SImode, const1_rtx);
   15927              : 
   15928              :       /* Emit SImode conditional move.  */
   15929           83 :       if (mode == HImode)
   15930              :         {
   15931           21 :           if (TARGET_ZERO_EXTEND_WITH_AND
   15932           21 :               && optimize_function_for_speed_p (cfun))
   15933              :             {
   15934            0 :               op2 = force_reg (SImode, const0_rtx);
   15935              : 
   15936            0 :               emit_insn (gen_movstricthi
   15937            0 :                          (gen_lowpart (HImode, op2), op0));
   15938              :             }
   15939              :           else
   15940              :             {
   15941           21 :               op2 = gen_reg_rtx (SImode);
   15942              : 
   15943           21 :               emit_insn (gen_zero_extendhisi2 (op2, op0));
   15944              :             }
   15945              :         }
   15946           62 :       else if (mode == SImode)
   15947              :         op2 = op0;
   15948              :       else
   15949           20 :         op2 = gen_rtx_SUBREG (SImode, op0, 0);
   15950              : 
   15951           83 :       if (target == 0
   15952           83 :           || !register_operand (target, SImode))
   15953            7 :         target = gen_reg_rtx (SImode);
   15954              : 
   15955           83 :       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
   15956              :                          const0_rtx);
   15957           83 :       emit_insn (gen_rtx_SET (target,
   15958              :                               gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
   15959           83 :       return target;
   15960              : 
   15961           19 :     case IX86_BUILTIN_RDSEED16_STEP:
   15962           19 :       mode = HImode;
   15963           19 :       goto rdseed_step;
   15964              : 
   15965           28 :     case IX86_BUILTIN_RDSEED32_STEP:
   15966           28 :       mode = SImode;
   15967           28 :       goto rdseed_step;
   15968              : 
   15969              :     case IX86_BUILTIN_RDSEED64_STEP:
   15970              :       mode = DImode;
   15971              : 
   15972           66 : rdseed_step:
   15973           66 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15974           66 :       op1 = expand_normal (arg0);
   15975           66 :       if (!address_operand (op1, VOIDmode))
   15976              :         {
   15977           28 :           op1 = convert_memory_address (Pmode, op1);
   15978           28 :           op1 = copy_addr_to_reg (op1);
   15979              :         }
   15980              : 
   15981           66 :       op0 = gen_reg_rtx (mode);
   15982           66 :       emit_insn (gen_rdseed (mode, op0));
   15983              : 
   15984           66 :       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   15985              : 
   15986           66 :       op2 = gen_reg_rtx (QImode);
   15987              : 
   15988           66 :       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   15989              :                          const0_rtx);
   15990           66 :       emit_insn (gen_rtx_SET (op2, pat));
   15991              : 
   15992           66 :       if (target == 0
   15993           66 :           || !register_operand (target, SImode))
   15994            1 :         target = gen_reg_rtx (SImode);
   15995              : 
   15996           66 :       emit_insn (gen_zero_extendqisi2 (target, op2));
   15997           66 :       return target;
   15998              : 
   15999           38 :     case IX86_BUILTIN_SBB32:
   16000           38 :       icode = CODE_FOR_subborrowsi;
   16001           38 :       icode2 = CODE_FOR_subborrowsi_0;
   16002           38 :       mode0 = SImode;
   16003           38 :       mode1 = DImode;
   16004           38 :       mode2 = CCmode;
   16005           38 :       goto handlecarry;
   16006              : 
   16007           44 :     case IX86_BUILTIN_SBB64:
   16008           44 :       icode = CODE_FOR_subborrowdi;
   16009           44 :       icode2 = CODE_FOR_subborrowdi_0;
   16010           44 :       mode0 = DImode;
   16011           44 :       mode1 = TImode;
   16012           44 :       mode2 = CCmode;
   16013           44 :       goto handlecarry;
   16014              : 
   16015           69 :     case IX86_BUILTIN_ADDCARRYX32:
   16016           69 :       icode = CODE_FOR_addcarrysi;
   16017           69 :       icode2 = CODE_FOR_addcarrysi_0;
   16018           69 :       mode0 = SImode;
   16019           69 :       mode1 = DImode;
   16020           69 :       mode2 = CCCmode;
   16021           69 :       goto handlecarry;
   16022              : 
   16023              :     case IX86_BUILTIN_ADDCARRYX64:
   16024              :       icode = CODE_FOR_addcarrydi;
   16025              :       icode2 = CODE_FOR_addcarrydi_0;
   16026              :       mode0 = DImode;
   16027              :       mode1 = TImode;
   16028              :       mode2 = CCCmode;
   16029              : 
   16030          213 :     handlecarry:
   16031          213 :       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
   16032          213 :       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
   16033          213 :       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
   16034          213 :       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
   16035              : 
   16036          213 :       op1 = expand_normal (arg0);
   16037              : 
   16038          213 :       op2 = expand_normal (arg1);
   16039          213 :       if (!register_operand (op2, mode0))
   16040          118 :         op2 = copy_to_mode_reg (mode0, op2);
   16041              : 
   16042          213 :       op3 = expand_normal (arg2);
   16043          213 :       if (!register_operand (op3, mode0))
   16044          121 :         op3 = copy_to_mode_reg (mode0, op3);
   16045              : 
   16046          213 :       op4 = expand_normal (arg3);
   16047          213 :       if (!address_operand (op4, VOIDmode))
   16048              :         {
   16049           68 :           op4 = convert_memory_address (Pmode, op4);
   16050           68 :           op4 = copy_addr_to_reg (op4);
   16051              :         }
   16052              : 
   16053          213 :       op0 = gen_reg_rtx (mode0);
   16054          213 :       if (op1 == const0_rtx)
   16055              :         {
   16056              :           /* If arg0 is 0, optimize right away into add or sub
   16057              :              instruction that sets CCCmode flags.  */
   16058           21 :           op1 = gen_rtx_REG (mode2, FLAGS_REG);
   16059           21 :           emit_insn (GEN_FCN (icode2) (op0, op2, op3));
   16060              :         }
   16061              :       else
   16062              :         {
   16063              :           /* Generate CF from input operand.  */
   16064          192 :           ix86_expand_carry (op1);
   16065              : 
   16066              :           /* Generate instruction that consumes CF.  */
   16067          192 :           op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
   16068          192 :           pat = gen_rtx_LTU (mode1, op1, const0_rtx);
   16069          192 :           pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
   16070          192 :           emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
   16071              :         }
   16072              : 
   16073              :       /* Return current CF value.  */
   16074          213 :       if (target == 0)
   16075           14 :         target = gen_reg_rtx (QImode);
   16076              : 
   16077          213 :       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
   16078          213 :       emit_insn (gen_rtx_SET (target, pat));
   16079              : 
   16080              :       /* Store the result.  */
   16081          213 :       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
   16082              : 
   16083          213 :       return target;
   16084              : 
   16085           24 :     case IX86_BUILTIN_READ_FLAGS:
   16086           24 :       if (ignore)
   16087            1 :         return const0_rtx;
   16088              : 
   16089           23 :       emit_insn (gen_pushfl ());
   16090              : 
   16091           23 :       if (optimize
   16092           11 :           || target == NULL_RTX
   16093           11 :           || !nonimmediate_operand (target, word_mode)
   16094           34 :           || GET_MODE (target) != word_mode)
   16095           12 :         target = gen_reg_rtx (word_mode);
   16096              : 
   16097           23 :       emit_insn (gen_pop (target));
   16098           23 :       return target;
   16099              : 
   16100           21 :     case IX86_BUILTIN_WRITE_FLAGS:
   16101              : 
   16102           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16103           21 :       op0 = expand_normal (arg0);
   16104           21 :       if (!general_no_elim_operand (op0, word_mode))
   16105            0 :         op0 = copy_to_mode_reg (word_mode, op0);
   16106              : 
   16107           21 :       emit_insn (gen_push (op0));
   16108           21 :       emit_insn (gen_popfl ());
   16109           21 :       return 0;
   16110              : 
   16111           22 :     case IX86_BUILTIN_KTESTC8:
   16112           22 :       icode = CODE_FOR_ktestqi;
   16113           22 :       mode3 = CCCmode;
   16114           22 :       goto kortest;
   16115              : 
   16116           22 :     case IX86_BUILTIN_KTESTZ8:
   16117           22 :       icode = CODE_FOR_ktestqi;
   16118           22 :       mode3 = CCZmode;
   16119           22 :       goto kortest;
   16120              : 
   16121           22 :     case IX86_BUILTIN_KTESTC16:
   16122           22 :       icode = CODE_FOR_ktesthi;
   16123           22 :       mode3 = CCCmode;
   16124           22 :       goto kortest;
   16125              : 
   16126           22 :     case IX86_BUILTIN_KTESTZ16:
   16127           22 :       icode = CODE_FOR_ktesthi;
   16128           22 :       mode3 = CCZmode;
   16129           22 :       goto kortest;
   16130              : 
   16131           22 :     case IX86_BUILTIN_KTESTC32:
   16132           22 :       icode = CODE_FOR_ktestsi;
   16133           22 :       mode3 = CCCmode;
   16134           22 :       goto kortest;
   16135              : 
   16136           22 :     case IX86_BUILTIN_KTESTZ32:
   16137           22 :       icode = CODE_FOR_ktestsi;
   16138           22 :       mode3 = CCZmode;
   16139           22 :       goto kortest;
   16140              : 
   16141           22 :     case IX86_BUILTIN_KTESTC64:
   16142           22 :       icode = CODE_FOR_ktestdi;
   16143           22 :       mode3 = CCCmode;
   16144           22 :       goto kortest;
   16145              : 
   16146           22 :     case IX86_BUILTIN_KTESTZ64:
   16147           22 :       icode = CODE_FOR_ktestdi;
   16148           22 :       mode3 = CCZmode;
   16149           22 :       goto kortest;
   16150              : 
   16151           22 :     case IX86_BUILTIN_KORTESTC8:
   16152           22 :       icode = CODE_FOR_kortestqi;
   16153           22 :       mode3 = CCCmode;
   16154           22 :       goto kortest;
   16155              : 
   16156           76 :     case IX86_BUILTIN_KORTESTZ8:
   16157           76 :       icode = CODE_FOR_kortestqi;
   16158           76 :       mode3 = CCZmode;
   16159           76 :       goto kortest;
   16160              : 
   16161           38 :     case IX86_BUILTIN_KORTESTC16:
   16162           38 :       icode = CODE_FOR_kortesthi;
   16163           38 :       mode3 = CCCmode;
   16164           38 :       goto kortest;
   16165              : 
   16166           91 :     case IX86_BUILTIN_KORTESTZ16:
   16167           91 :       icode = CODE_FOR_kortesthi;
   16168           91 :       mode3 = CCZmode;
   16169           91 :       goto kortest;
   16170              : 
   16171           22 :     case IX86_BUILTIN_KORTESTC32:
   16172           22 :       icode = CODE_FOR_kortestsi;
   16173           22 :       mode3 = CCCmode;
   16174           22 :       goto kortest;
   16175              : 
   16176           79 :     case IX86_BUILTIN_KORTESTZ32:
   16177           79 :       icode = CODE_FOR_kortestsi;
   16178           79 :       mode3 = CCZmode;
   16179           79 :       goto kortest;
   16180              : 
   16181           22 :     case IX86_BUILTIN_KORTESTC64:
   16182           22 :       icode = CODE_FOR_kortestdi;
   16183           22 :       mode3 = CCCmode;
   16184           22 :       goto kortest;
   16185              : 
   16186              :     case IX86_BUILTIN_KORTESTZ64:
   16187              :       icode = CODE_FOR_kortestdi;
   16188              :       mode3 = CCZmode;
   16189              : 
   16190          610 :     kortest:
   16191          610 :       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
   16192          610 :       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
   16193          610 :       op0 = expand_normal (arg0);
   16194          610 :       op1 = expand_normal (arg1);
   16195              : 
   16196          610 :       mode0 = insn_data[icode].operand[0].mode;
   16197          610 :       mode1 = insn_data[icode].operand[1].mode;
   16198              : 
   16199          610 :       if (GET_MODE (op0) != VOIDmode)
   16200          610 :         op0 = force_reg (GET_MODE (op0), op0);
   16201              : 
   16202          610 :       op0 = gen_lowpart (mode0, op0);
   16203              : 
   16204          610 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   16205            0 :         op0 = copy_to_mode_reg (mode0, op0);
   16206              : 
   16207          610 :       if (GET_MODE (op1) != VOIDmode)
   16208          609 :         op1 = force_reg (GET_MODE (op1), op1);
   16209              : 
   16210          610 :       op1 = gen_lowpart (mode1, op1);
   16211              : 
   16212          610 :       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   16213            1 :         op1 = copy_to_mode_reg (mode1, op1);
   16214              : 
   16215          610 :       target = gen_reg_rtx (QImode);
   16216              : 
   16217              :       /* Emit kortest.  */
   16218          610 :       emit_insn (GEN_FCN (icode) (op0, op1));
   16219              :       /* And use setcc to return result from flags.  */
   16220          610 :       ix86_expand_setcc (target, EQ,
   16221              :                          gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
   16222          610 :       return target;
   16223              : 
   16224           24 :     case IX86_BUILTIN_GATHERSIV2DF:
   16225           24 :       icode = CODE_FOR_avx2_gathersiv2df;
   16226           24 :       goto gather_gen;
   16227           18 :     case IX86_BUILTIN_GATHERSIV4DF:
   16228           18 :       icode = CODE_FOR_avx2_gathersiv4df;
   16229           18 :       goto gather_gen;
   16230           21 :     case IX86_BUILTIN_GATHERDIV2DF:
   16231           21 :       icode = CODE_FOR_avx2_gatherdiv2df;
   16232           21 :       goto gather_gen;
   16233           32 :     case IX86_BUILTIN_GATHERDIV4DF:
   16234           32 :       icode = CODE_FOR_avx2_gatherdiv4df;
   16235           32 :       goto gather_gen;
   16236           30 :     case IX86_BUILTIN_GATHERSIV4SF:
   16237           30 :       icode = CODE_FOR_avx2_gathersiv4sf;
   16238           30 :       goto gather_gen;
   16239           37 :     case IX86_BUILTIN_GATHERSIV8SF:
   16240           37 :       icode = CODE_FOR_avx2_gathersiv8sf;
   16241           37 :       goto gather_gen;
   16242           24 :     case IX86_BUILTIN_GATHERDIV4SF:
   16243           24 :       icode = CODE_FOR_avx2_gatherdiv4sf;
   16244           24 :       goto gather_gen;
   16245           18 :     case IX86_BUILTIN_GATHERDIV8SF:
   16246           18 :       icode = CODE_FOR_avx2_gatherdiv8sf;
   16247           18 :       goto gather_gen;
   16248           18 :     case IX86_BUILTIN_GATHERSIV2DI:
   16249           18 :       icode = CODE_FOR_avx2_gathersiv2di;
   16250           18 :       goto gather_gen;
   16251           18 :     case IX86_BUILTIN_GATHERSIV4DI:
   16252           18 :       icode = CODE_FOR_avx2_gathersiv4di;
   16253           18 :       goto gather_gen;
   16254           27 :     case IX86_BUILTIN_GATHERDIV2DI:
   16255           27 :       icode = CODE_FOR_avx2_gatherdiv2di;
   16256           27 :       goto gather_gen;
   16257           29 :     case IX86_BUILTIN_GATHERDIV4DI:
   16258           29 :       icode = CODE_FOR_avx2_gatherdiv4di;
   16259           29 :       goto gather_gen;
   16260           20 :     case IX86_BUILTIN_GATHERSIV4SI:
   16261           20 :       icode = CODE_FOR_avx2_gathersiv4si;
   16262           20 :       goto gather_gen;
   16263           22 :     case IX86_BUILTIN_GATHERSIV8SI:
   16264           22 :       icode = CODE_FOR_avx2_gathersiv8si;
   16265           22 :       goto gather_gen;
   16266           28 :     case IX86_BUILTIN_GATHERDIV4SI:
   16267           28 :       icode = CODE_FOR_avx2_gatherdiv4si;
   16268           28 :       goto gather_gen;
   16269           18 :     case IX86_BUILTIN_GATHERDIV8SI:
   16270           18 :       icode = CODE_FOR_avx2_gatherdiv8si;
   16271           18 :       goto gather_gen;
   16272           20 :     case IX86_BUILTIN_GATHERALTSIV4DF:
   16273           20 :       icode = CODE_FOR_avx2_gathersiv4df;
   16274           20 :       goto gather_gen;
   16275           16 :     case IX86_BUILTIN_GATHERALTDIV8SF:
   16276           16 :       icode = CODE_FOR_avx2_gatherdiv8sf;
   16277           16 :       goto gather_gen;
   16278            4 :     case IX86_BUILTIN_GATHERALTSIV4DI:
   16279            4 :       icode = CODE_FOR_avx2_gathersiv4di;
   16280            4 :       goto gather_gen;
   16281           12 :     case IX86_BUILTIN_GATHERALTDIV8SI:
   16282           12 :       icode = CODE_FOR_avx2_gatherdiv8si;
   16283           12 :       goto gather_gen;
   16284           36 :     case IX86_BUILTIN_GATHER3SIV16SF:
   16285           36 :       icode = CODE_FOR_avx512f_gathersiv16sf;
   16286           36 :       goto gather_gen;
   16287           24 :     case IX86_BUILTIN_GATHER3SIV8DF:
   16288           24 :       icode = CODE_FOR_avx512f_gathersiv8df;
   16289           24 :       goto gather_gen;
   16290           24 :     case IX86_BUILTIN_GATHER3DIV16SF:
   16291           24 :       icode = CODE_FOR_avx512f_gatherdiv16sf;
   16292           24 :       goto gather_gen;
   16293           37 :     case IX86_BUILTIN_GATHER3DIV8DF:
   16294           37 :       icode = CODE_FOR_avx512f_gatherdiv8df;
   16295           37 :       goto gather_gen;
   16296           30 :     case IX86_BUILTIN_GATHER3SIV16SI:
   16297           30 :       icode = CODE_FOR_avx512f_gathersiv16si;
   16298           30 :       goto gather_gen;
   16299           24 :     case IX86_BUILTIN_GATHER3SIV8DI:
   16300           24 :       icode = CODE_FOR_avx512f_gathersiv8di;
   16301           24 :       goto gather_gen;
   16302           24 :     case IX86_BUILTIN_GATHER3DIV16SI:
   16303           24 :       icode = CODE_FOR_avx512f_gatherdiv16si;
   16304           24 :       goto gather_gen;
   16305           37 :     case IX86_BUILTIN_GATHER3DIV8DI:
   16306           37 :       icode = CODE_FOR_avx512f_gatherdiv8di;
   16307           37 :       goto gather_gen;
   16308           16 :     case IX86_BUILTIN_GATHER3ALTSIV8DF:
   16309           16 :       icode = CODE_FOR_avx512f_gathersiv8df;
   16310           16 :       goto gather_gen;
   16311           22 :     case IX86_BUILTIN_GATHER3ALTDIV16SF:
   16312           22 :       icode = CODE_FOR_avx512f_gatherdiv16sf;
   16313           22 :       goto gather_gen;
   16314           14 :     case IX86_BUILTIN_GATHER3ALTSIV8DI:
   16315           14 :       icode = CODE_FOR_avx512f_gathersiv8di;
   16316           14 :       goto gather_gen;
   16317           18 :     case IX86_BUILTIN_GATHER3ALTDIV16SI:
   16318           18 :       icode = CODE_FOR_avx512f_gatherdiv16si;
   16319           18 :       goto gather_gen;
   16320           18 :     case IX86_BUILTIN_GATHER3SIV2DF:
   16321           18 :       icode = CODE_FOR_avx512vl_gathersiv2df;
   16322           18 :       goto gather_gen;
   16323           10 :     case IX86_BUILTIN_GATHER3SIV4DF:
   16324           10 :       icode = CODE_FOR_avx512vl_gathersiv4df;
   16325           10 :       goto gather_gen;
   16326           15 :     case IX86_BUILTIN_GATHER3DIV2DF:
   16327           15 :       icode = CODE_FOR_avx512vl_gatherdiv2df;
   16328           15 :       goto gather_gen;
   16329           16 :     case IX86_BUILTIN_GATHER3DIV4DF:
   16330           16 :       icode = CODE_FOR_avx512vl_gatherdiv4df;
   16331           16 :       goto gather_gen;
   16332           14 :     case IX86_BUILTIN_GATHER3SIV4SF:
   16333           14 :       icode = CODE_FOR_avx512vl_gathersiv4sf;
   16334           14 :       goto gather_gen;
   16335           12 :     case IX86_BUILTIN_GATHER3SIV8SF:
   16336           12 :       icode = CODE_FOR_avx512vl_gathersiv8sf;
   16337           12 :       goto gather_gen;
   16338           22 :     case IX86_BUILTIN_GATHER3DIV4SF:
   16339           22 :       icode = CODE_FOR_avx512vl_gatherdiv4sf;
   16340           22 :       goto gather_gen;
   16341           10 :     case IX86_BUILTIN_GATHER3DIV8SF:
   16342           10 :       icode = CODE_FOR_avx512vl_gatherdiv8sf;
   16343           10 :       goto gather_gen;
   16344           20 :     case IX86_BUILTIN_GATHER3SIV2DI:
   16345           20 :       icode = CODE_FOR_avx512vl_gathersiv2di;
   16346           20 :       goto gather_gen;
   16347           10 :     case IX86_BUILTIN_GATHER3SIV4DI:
   16348           10 :       icode = CODE_FOR_avx512vl_gathersiv4di;
   16349           10 :       goto gather_gen;
   16350           14 :     case IX86_BUILTIN_GATHER3DIV2DI:
   16351           14 :       icode = CODE_FOR_avx512vl_gatherdiv2di;
   16352           14 :       goto gather_gen;
   16353           13 :     case IX86_BUILTIN_GATHER3DIV4DI:
   16354           13 :       icode = CODE_FOR_avx512vl_gatherdiv4di;
   16355           13 :       goto gather_gen;
   16356           14 :     case IX86_BUILTIN_GATHER3SIV4SI:
   16357           14 :       icode = CODE_FOR_avx512vl_gathersiv4si;
   16358           14 :       goto gather_gen;
   16359           12 :     case IX86_BUILTIN_GATHER3SIV8SI:
   16360           12 :       icode = CODE_FOR_avx512vl_gathersiv8si;
   16361           12 :       goto gather_gen;
   16362           24 :     case IX86_BUILTIN_GATHER3DIV4SI:
   16363           24 :       icode = CODE_FOR_avx512vl_gatherdiv4si;
   16364           24 :       goto gather_gen;
   16365           10 :     case IX86_BUILTIN_GATHER3DIV8SI:
   16366           10 :       icode = CODE_FOR_avx512vl_gatherdiv8si;
   16367           10 :       goto gather_gen;
   16368            4 :     case IX86_BUILTIN_GATHER3ALTSIV4DF:
   16369            4 :       icode = CODE_FOR_avx512vl_gathersiv4df;
   16370            4 :       goto gather_gen;
   16371            8 :     case IX86_BUILTIN_GATHER3ALTDIV8SF:
   16372            8 :       icode = CODE_FOR_avx512vl_gatherdiv8sf;
   16373            8 :       goto gather_gen;
   16374            6 :     case IX86_BUILTIN_GATHER3ALTSIV4DI:
   16375            6 :       icode = CODE_FOR_avx512vl_gathersiv4di;
   16376            6 :       goto gather_gen;
   16377           10 :     case IX86_BUILTIN_GATHER3ALTDIV8SI:
   16378           10 :       icode = CODE_FOR_avx512vl_gatherdiv8si;
   16379           10 :       goto gather_gen;
   16380           40 :     case IX86_BUILTIN_SCATTERSIV16SF:
   16381           40 :       icode = CODE_FOR_avx512f_scattersiv16sf;
   16382           40 :       goto scatter_gen;
   16383           27 :     case IX86_BUILTIN_SCATTERSIV8DF:
   16384           27 :       icode = CODE_FOR_avx512f_scattersiv8df;
   16385           27 :       goto scatter_gen;
   16386           24 :     case IX86_BUILTIN_SCATTERDIV16SF:
   16387           24 :       icode = CODE_FOR_avx512f_scatterdiv16sf;
   16388           24 :       goto scatter_gen;
   16389           33 :     case IX86_BUILTIN_SCATTERDIV8DF:
   16390           33 :       icode = CODE_FOR_avx512f_scatterdiv8df;
   16391           33 :       goto scatter_gen;
   16392           30 :     case IX86_BUILTIN_SCATTERSIV16SI:
   16393           30 :       icode = CODE_FOR_avx512f_scattersiv16si;
   16394           30 :       goto scatter_gen;
   16395           24 :     case IX86_BUILTIN_SCATTERSIV8DI:
   16396           24 :       icode = CODE_FOR_avx512f_scattersiv8di;
   16397           24 :       goto scatter_gen;
   16398           24 :     case IX86_BUILTIN_SCATTERDIV16SI:
   16399           24 :       icode = CODE_FOR_avx512f_scatterdiv16si;
   16400           24 :       goto scatter_gen;
   16401           29 :     case IX86_BUILTIN_SCATTERDIV8DI:
   16402           29 :       icode = CODE_FOR_avx512f_scatterdiv8di;
   16403           29 :       goto scatter_gen;
   16404           18 :     case IX86_BUILTIN_SCATTERSIV8SF:
   16405           18 :       icode = CODE_FOR_avx512vl_scattersiv8sf;
   16406           18 :       goto scatter_gen;
   16407           20 :     case IX86_BUILTIN_SCATTERSIV4SF:
   16408           20 :       icode = CODE_FOR_avx512vl_scattersiv4sf;
   16409           20 :       goto scatter_gen;
   16410           16 :     case IX86_BUILTIN_SCATTERSIV4DF:
   16411           16 :       icode = CODE_FOR_avx512vl_scattersiv4df;
   16412           16 :       goto scatter_gen;
   16413           16 :     case IX86_BUILTIN_SCATTERSIV2DF:
   16414           16 :       icode = CODE_FOR_avx512vl_scattersiv2df;
   16415           16 :       goto scatter_gen;
   16416           16 :     case IX86_BUILTIN_SCATTERDIV8SF:
   16417           16 :       icode = CODE_FOR_avx512vl_scatterdiv8sf;
   16418           16 :       goto scatter_gen;
   16419           16 :     case IX86_BUILTIN_SCATTERDIV4SF:
   16420           16 :       icode = CODE_FOR_avx512vl_scatterdiv4sf;
   16421           16 :       goto scatter_gen;
   16422           18 :     case IX86_BUILTIN_SCATTERDIV4DF:
   16423           18 :       icode = CODE_FOR_avx512vl_scatterdiv4df;
   16424           18 :       goto scatter_gen;
   16425           18 :     case IX86_BUILTIN_SCATTERDIV2DF:
   16426           18 :       icode = CODE_FOR_avx512vl_scatterdiv2df;
   16427           18 :       goto scatter_gen;
   16428           22 :     case IX86_BUILTIN_SCATTERSIV8SI:
   16429           22 :       icode = CODE_FOR_avx512vl_scattersiv8si;
   16430           22 :       goto scatter_gen;
   16431           24 :     case IX86_BUILTIN_SCATTERSIV4SI:
   16432           24 :       icode = CODE_FOR_avx512vl_scattersiv4si;
   16433           24 :       goto scatter_gen;
   16434           16 :     case IX86_BUILTIN_SCATTERSIV4DI:
   16435           16 :       icode = CODE_FOR_avx512vl_scattersiv4di;
   16436           16 :       goto scatter_gen;
   16437           16 :     case IX86_BUILTIN_SCATTERSIV2DI:
   16438           16 :       icode = CODE_FOR_avx512vl_scattersiv2di;
   16439           16 :       goto scatter_gen;
   16440           16 :     case IX86_BUILTIN_SCATTERDIV8SI:
   16441           16 :       icode = CODE_FOR_avx512vl_scatterdiv8si;
   16442           16 :       goto scatter_gen;
   16443           16 :     case IX86_BUILTIN_SCATTERDIV4SI:
   16444           16 :       icode = CODE_FOR_avx512vl_scatterdiv4si;
   16445           16 :       goto scatter_gen;
   16446           18 :     case IX86_BUILTIN_SCATTERDIV4DI:
   16447           18 :       icode = CODE_FOR_avx512vl_scatterdiv4di;
   16448           18 :       goto scatter_gen;
   16449           18 :     case IX86_BUILTIN_SCATTERDIV2DI:
   16450           18 :       icode = CODE_FOR_avx512vl_scatterdiv2di;
   16451           18 :       goto scatter_gen;
   16452           16 :     case IX86_BUILTIN_SCATTERALTSIV8DF:
   16453           16 :       icode = CODE_FOR_avx512f_scattersiv8df;
   16454           16 :       goto scatter_gen;
   16455           12 :     case IX86_BUILTIN_SCATTERALTDIV16SF:
   16456           12 :       icode = CODE_FOR_avx512f_scatterdiv16sf;
   16457           12 :       goto scatter_gen;
   16458            8 :     case IX86_BUILTIN_SCATTERALTSIV8DI:
   16459            8 :       icode = CODE_FOR_avx512f_scattersiv8di;
   16460            8 :       goto scatter_gen;
   16461           24 :     case IX86_BUILTIN_SCATTERALTDIV16SI:
   16462           24 :       icode = CODE_FOR_avx512f_scatterdiv16si;
   16463           24 :       goto scatter_gen;
   16464            4 :     case IX86_BUILTIN_SCATTERALTSIV4DF:
   16465            4 :       icode = CODE_FOR_avx512vl_scattersiv4df;
   16466            4 :       goto scatter_gen;
   16467            4 :     case IX86_BUILTIN_SCATTERALTDIV8SF:
   16468            4 :       icode = CODE_FOR_avx512vl_scatterdiv8sf;
   16469            4 :       goto scatter_gen;
   16470            4 :     case IX86_BUILTIN_SCATTERALTSIV4DI:
   16471            4 :       icode = CODE_FOR_avx512vl_scattersiv4di;
   16472            4 :       goto scatter_gen;
   16473            4 :     case IX86_BUILTIN_SCATTERALTDIV8SI:
   16474            4 :       icode = CODE_FOR_avx512vl_scatterdiv8si;
   16475            4 :       goto scatter_gen;
   16476            8 :     case IX86_BUILTIN_SCATTERALTSIV2DF:
   16477            8 :       icode = CODE_FOR_avx512vl_scattersiv2df;
   16478            8 :       goto scatter_gen;
   16479            8 :     case IX86_BUILTIN_SCATTERALTDIV4SF:
   16480            8 :       icode = CODE_FOR_avx512vl_scatterdiv4sf;
   16481            8 :       goto scatter_gen;
   16482            8 :     case IX86_BUILTIN_SCATTERALTSIV2DI:
   16483            8 :       icode = CODE_FOR_avx512vl_scattersiv2di;
   16484            8 :       goto scatter_gen;
   16485            8 :     case IX86_BUILTIN_SCATTERALTDIV4SI:
   16486            8 :       icode = CODE_FOR_avx512vl_scatterdiv4si;
   16487            8 :       goto scatter_gen;
   16488              : 
   16489         1004 :     gather_gen:
   16490         1004 :       rtx half;
   16491         1004 :       rtx (*gen) (rtx, rtx);
   16492              : 
   16493         1004 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16494         1004 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16495         1004 :       arg2 = CALL_EXPR_ARG (exp, 2);
   16496         1004 :       arg3 = CALL_EXPR_ARG (exp, 3);
   16497         1004 :       arg4 = CALL_EXPR_ARG (exp, 4);
   16498         1004 :       op0 = expand_normal (arg0);
   16499         1004 :       op1 = expand_normal (arg1);
   16500         1004 :       op2 = expand_normal (arg2);
   16501         1004 :       op3 = ix86_expand_unsigned_small_int_cst_argument (arg3);
   16502         1004 :       op4 = expand_normal (arg4);
   16503              :       /* Note the arg order is different from the operand order.  */
   16504         1004 :       mode0 = insn_data[icode].operand[1].mode;
   16505         1004 :       mode2 = insn_data[icode].operand[3].mode;
   16506         1004 :       mode3 = insn_data[icode].operand[4].mode;
   16507         1004 :       mode4 = insn_data[icode].operand[5].mode;
   16508              : 
   16509         1004 :       if (target == NULL_RTX
   16510         1004 :           || GET_MODE (target) != insn_data[icode].operand[0].mode
   16511         1904 :           || !insn_data[icode].operand[0].predicate (target,
   16512              :                                                      GET_MODE (target)))
   16513          105 :         subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
   16514              :       else
   16515              :         subtarget = target;
   16516              : 
   16517         1004 :       switch (fcode)
   16518              :         {
   16519           30 :         case IX86_BUILTIN_GATHER3ALTSIV8DF:
   16520           30 :         case IX86_BUILTIN_GATHER3ALTSIV8DI:
   16521           30 :           half = gen_reg_rtx (V8SImode);
   16522           30 :           if (!nonimmediate_operand (op2, V16SImode))
   16523            0 :             op2 = copy_to_mode_reg (V16SImode, op2);
   16524           30 :           emit_insn (gen_vec_extract_lo_v16si (half, op2));
   16525           30 :           op2 = half;
   16526           30 :           break;
   16527           34 :         case IX86_BUILTIN_GATHER3ALTSIV4DF:
   16528           34 :         case IX86_BUILTIN_GATHER3ALTSIV4DI:
   16529           34 :         case IX86_BUILTIN_GATHERALTSIV4DF:
   16530           34 :         case IX86_BUILTIN_GATHERALTSIV4DI:
   16531           34 :           half = gen_reg_rtx (V4SImode);
   16532           34 :           if (!nonimmediate_operand (op2, V8SImode))
   16533            0 :             op2 = copy_to_mode_reg (V8SImode, op2);
   16534           34 :           emit_insn (gen_vec_extract_lo_v8si (half, op2));
   16535           34 :           op2 = half;
   16536           34 :           break;
   16537           40 :         case IX86_BUILTIN_GATHER3ALTDIV16SF:
   16538           40 :         case IX86_BUILTIN_GATHER3ALTDIV16SI:
   16539           40 :           half = gen_reg_rtx (mode0);
   16540           40 :           if (mode0 == V8SFmode)
   16541              :             gen = gen_vec_extract_lo_v16sf;
   16542              :           else
   16543           18 :             gen = gen_vec_extract_lo_v16si;
   16544           40 :           if (!nonimmediate_operand (op0, GET_MODE (op0)))
   16545           40 :             op0 = copy_to_mode_reg (GET_MODE (op0), op0);
   16546           40 :           emit_insn (gen (half, op0));
   16547           40 :           op0 = half;
   16548           40 :           op3 = lowpart_subreg (QImode, op3, HImode);
   16549           40 :           break;
   16550           46 :         case IX86_BUILTIN_GATHER3ALTDIV8SF:
   16551           46 :         case IX86_BUILTIN_GATHER3ALTDIV8SI:
   16552           46 :         case IX86_BUILTIN_GATHERALTDIV8SF:
   16553           46 :         case IX86_BUILTIN_GATHERALTDIV8SI:
   16554           46 :           half = gen_reg_rtx (mode0);
   16555           46 :           if (mode0 == V4SFmode)
   16556              :             gen = gen_vec_extract_lo_v8sf;
   16557              :           else
   16558           22 :             gen = gen_vec_extract_lo_v8si;
   16559           46 :           if (!nonimmediate_operand (op0, GET_MODE (op0)))
   16560           46 :             op0 = copy_to_mode_reg (GET_MODE (op0), op0);
   16561           46 :           emit_insn (gen (half, op0));
   16562           46 :           op0 = half;
   16563           46 :           if (VECTOR_MODE_P (GET_MODE (op3)))
   16564              :             {
   16565           28 :               half = gen_reg_rtx (mode0);
   16566           28 :               if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16567           12 :                 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16568           28 :               emit_insn (gen (half, op3));
   16569           28 :               op3 = half;
   16570              :             }
   16571              :           break;
   16572              :         default:
   16573              :           break;
   16574              :         }
   16575              : 
   16576              :       /* Force memory operand only with base register here.  But we
   16577              :          don't want to do it on memory operand for other builtin
   16578              :          functions.  */
   16579         1004 :       op1 = ix86_zero_extend_to_Pmode (op1);
   16580              : 
   16581         1004 :       if (!insn_data[icode].operand[1].predicate (op0, mode0))
   16582          403 :         op0 = copy_to_mode_reg (mode0, op0);
   16583         1009 :       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
   16584            0 :         op1 = copy_to_mode_reg (Pmode, op1);
   16585         1004 :       if (!insn_data[icode].operand[3].predicate (op2, mode2))
   16586          221 :         op2 = copy_to_mode_reg (mode2, op2);
   16587              : 
   16588         1004 :       op3 = fixup_modeless_constant (op3, mode3);
   16589              : 
   16590         1004 :       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
   16591              :         {
   16592         1004 :           if (!insn_data[icode].operand[4].predicate (op3, mode3))
   16593          356 :             op3 = copy_to_mode_reg (mode3, op3);
   16594              :         }
   16595              :       else
   16596              :         {
   16597            0 :           op3 = copy_to_reg (op3);
   16598            0 :           op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
   16599              :         }
   16600         1004 :       if (!insn_data[icode].operand[5].predicate (op4, mode4))
   16601              :         {
   16602            0 :           error ("the last argument must be scale 1, 2, 4, 8");
   16603            0 :           return const0_rtx;
   16604              :         }
   16605              : 
   16606              :       /* Optimize.  If mask is known to have all high bits set,
   16607              :          replace op0 with pc_rtx to signal that the instruction
   16608              :          overwrites the whole destination and doesn't use its
   16609              :          previous contents.  */
   16610         1004 :       if (optimize)
   16611              :         {
   16612          914 :           if (TREE_CODE (arg3) == INTEGER_CST)
   16613              :             {
   16614          209 :               if (integer_all_onesp (arg3))
   16615          201 :                 op0 = pc_rtx;
   16616              :             }
   16617          705 :           else if (TREE_CODE (arg3) == VECTOR_CST)
   16618              :             {
   16619              :               unsigned int negative = 0;
   16620          755 :               for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
   16621              :                 {
   16622          620 :                   tree cst = VECTOR_CST_ELT (arg3, i);
   16623          620 :                   if (TREE_CODE (cst) == INTEGER_CST
   16624          620 :                       && tree_int_cst_sign_bit (cst))
   16625          286 :                     negative++;
   16626          334 :                   else if (TREE_CODE (cst) == REAL_CST
   16627          334 :                            && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
   16628          306 :                     negative++;
   16629              :                 }
   16630          135 :               if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
   16631          121 :                 op0 = pc_rtx;
   16632              :             }
   16633          570 :           else if (TREE_CODE (arg3) == SSA_NAME
   16634          570 :                    && VECTOR_TYPE_P (TREE_TYPE (arg3)))
   16635              :             {
   16636              :               /* Recognize also when mask is like:
   16637              :                  __v2df src = _mm_setzero_pd ();
   16638              :                  __v2df mask = _mm_cmpeq_pd (src, src);
   16639              :                  or
   16640              :                  __v8sf src = _mm256_setzero_ps ();
   16641              :                  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
   16642              :                  as that is a cheaper way to load all ones into
   16643              :                  a register than having to load a constant from
   16644              :                  memory.  */
   16645          259 :               gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
   16646          259 :               if (is_gimple_call (def_stmt))
   16647              :                 {
   16648           76 :                   tree fndecl = gimple_call_fndecl (def_stmt);
   16649           76 :                   if (fndecl
   16650           76 :                       && fndecl_built_in_p (fndecl, BUILT_IN_MD))
   16651           67 :                     switch (DECL_MD_FUNCTION_CODE (fndecl))
   16652              :                       {
   16653           24 :                       case IX86_BUILTIN_CMPPD:
   16654           24 :                       case IX86_BUILTIN_CMPPS:
   16655           24 :                       case IX86_BUILTIN_CMPPD256:
   16656           24 :                       case IX86_BUILTIN_CMPPS256:
   16657           24 :                         if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
   16658              :                           break;
   16659              :                         /* FALLTHRU */
   16660           49 :                       case IX86_BUILTIN_CMPEQPD:
   16661           49 :                       case IX86_BUILTIN_CMPEQPS:
   16662           49 :                         if (initializer_zerop (gimple_call_arg (def_stmt, 0))
   16663           49 :                             && initializer_zerop (gimple_call_arg (def_stmt,
   16664              :                                                                    1)))
   16665           49 :                           op0 = pc_rtx;
   16666              :                         break;
   16667              :                       default:
   16668              :                         break;
   16669              :                       }
   16670              :                 }
   16671              :             }
   16672              :         }
   16673              : 
   16674         1004 :       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
   16675         1004 :       if (! pat)
   16676            0 :         return const0_rtx;
   16677         1004 :       emit_insn (pat);
   16678              : 
   16679         1004 :       switch (fcode)
   16680              :         {
   16681           24 :         case IX86_BUILTIN_GATHER3DIV16SF:
   16682           24 :           if (target == NULL_RTX)
   16683            0 :             target = gen_reg_rtx (V8SFmode);
   16684           24 :           emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
   16685           24 :           break;
   16686           24 :         case IX86_BUILTIN_GATHER3DIV16SI:
   16687           24 :           if (target == NULL_RTX)
   16688            0 :             target = gen_reg_rtx (V8SImode);
   16689           24 :           emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
   16690           24 :           break;
   16691           28 :         case IX86_BUILTIN_GATHER3DIV8SF:
   16692           28 :         case IX86_BUILTIN_GATHERDIV8SF:
   16693           28 :           if (target == NULL_RTX)
   16694            0 :             target = gen_reg_rtx (V4SFmode);
   16695           28 :           emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
   16696           28 :           break;
   16697           28 :         case IX86_BUILTIN_GATHER3DIV8SI:
   16698           28 :         case IX86_BUILTIN_GATHERDIV8SI:
   16699           28 :           if (target == NULL_RTX)
   16700            0 :             target = gen_reg_rtx (V4SImode);
   16701           28 :           emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
   16702           28 :           break;
   16703              :         default:
   16704              :           target = subtarget;
   16705              :           break;
   16706              :         }
   16707              :       return target;
   16708              : 
   16709          623 :     scatter_gen:
   16710          623 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16711          623 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16712          623 :       arg2 = CALL_EXPR_ARG (exp, 2);
   16713          623 :       arg3 = CALL_EXPR_ARG (exp, 3);
   16714          623 :       arg4 = CALL_EXPR_ARG (exp, 4);
   16715          623 :       op0 = expand_normal (arg0);
   16716          623 :       op1 = ix86_expand_unsigned_small_int_cst_argument (arg1);
   16717          623 :       op2 = expand_normal (arg2);
   16718          623 :       op3 = expand_normal (arg3);
   16719          623 :       op4 = expand_normal (arg4);
   16720          623 :       mode1 = insn_data[icode].operand[1].mode;
   16721          623 :       mode2 = insn_data[icode].operand[2].mode;
   16722          623 :       mode3 = insn_data[icode].operand[3].mode;
   16723          623 :       mode4 = insn_data[icode].operand[4].mode;
   16724              : 
   16725              :       /* Scatter instruction stores operand op3 to memory with
   16726              :          indices from op2 and scale from op4 under writemask op1.
   16727              :          If index operand op2 has more elements then source operand
   16728              :          op3 one need to use only its low half. And vice versa.  */
   16729          623 :       switch (fcode)
   16730              :         {
   16731           24 :         case IX86_BUILTIN_SCATTERALTSIV8DF:
   16732           24 :         case IX86_BUILTIN_SCATTERALTSIV8DI:
   16733           24 :           half = gen_reg_rtx (V8SImode);
   16734           24 :           if (!nonimmediate_operand (op2, V16SImode))
   16735            0 :             op2 = copy_to_mode_reg (V16SImode, op2);
   16736           24 :           emit_insn (gen_vec_extract_lo_v16si (half, op2));
   16737           24 :           op2 = half;
   16738           24 :           break;
   16739           36 :         case IX86_BUILTIN_SCATTERALTDIV16SF:
   16740           36 :         case IX86_BUILTIN_SCATTERALTDIV16SI:
   16741           36 :           half = gen_reg_rtx (mode3);
   16742           36 :           if (mode3 == V8SFmode)
   16743              :             gen = gen_vec_extract_lo_v16sf;
   16744              :           else
   16745           24 :             gen = gen_vec_extract_lo_v16si;
   16746           36 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16747            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16748           36 :           emit_insn (gen (half, op3));
   16749           36 :           op3 = half;
   16750           36 :           break;
   16751            8 :         case IX86_BUILTIN_SCATTERALTSIV4DF:
   16752            8 :         case IX86_BUILTIN_SCATTERALTSIV4DI:
   16753            8 :           half = gen_reg_rtx (V4SImode);
   16754            8 :           if (!nonimmediate_operand (op2, V8SImode))
   16755            0 :             op2 = copy_to_mode_reg (V8SImode, op2);
   16756            8 :           emit_insn (gen_vec_extract_lo_v8si (half, op2));
   16757            8 :           op2 = half;
   16758            8 :           break;
   16759            8 :         case IX86_BUILTIN_SCATTERALTDIV8SF:
   16760            8 :         case IX86_BUILTIN_SCATTERALTDIV8SI:
   16761            8 :           half = gen_reg_rtx (mode3);
   16762            8 :           if (mode3 == V4SFmode)
   16763              :             gen = gen_vec_extract_lo_v8sf;
   16764              :           else
   16765            4 :             gen = gen_vec_extract_lo_v8si;
   16766            8 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16767            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16768            8 :           emit_insn (gen (half, op3));
   16769            8 :           op3 = half;
   16770            8 :           break;
   16771           16 :         case IX86_BUILTIN_SCATTERALTSIV2DF:
   16772           16 :         case IX86_BUILTIN_SCATTERALTSIV2DI:
   16773           16 :           if (!nonimmediate_operand (op2, V4SImode))
   16774            0 :             op2 = copy_to_mode_reg (V4SImode, op2);
   16775              :           break;
   16776           16 :         case IX86_BUILTIN_SCATTERALTDIV4SF:
   16777           16 :         case IX86_BUILTIN_SCATTERALTDIV4SI:
   16778           16 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16779            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16780              :           break;
   16781              :         default:
   16782              :           break;
   16783              :         }
   16784              : 
   16785              :       /* Force memory operand only with base register here.  But we
   16786              :          don't want to do it on memory operand for other builtin
   16787              :          functions.  */
   16788          633 :       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
   16789              : 
   16790          628 :       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   16791            0 :         op0 = copy_to_mode_reg (Pmode, op0);
   16792              : 
   16793          623 :       op1 = fixup_modeless_constant (op1, mode1);
   16794              : 
   16795          623 :       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
   16796              :         {
   16797          607 :           if (!insn_data[icode].operand[1].predicate (op1, mode1))
   16798          273 :             op1 = copy_to_mode_reg (mode1, op1);
   16799              :         }
   16800              :       else
   16801              :         {
   16802           16 :           op1 = copy_to_reg (op1);
   16803           16 :           op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
   16804              :         }
   16805              : 
   16806          623 :       if (!insn_data[icode].operand[2].predicate (op2, mode2))
   16807           57 :         op2 = copy_to_mode_reg (mode2, op2);
   16808              : 
   16809          623 :       if (!insn_data[icode].operand[3].predicate (op3, mode3))
   16810           82 :         op3 = copy_to_mode_reg (mode3, op3);
   16811              : 
   16812          623 :       if (!insn_data[icode].operand[4].predicate (op4, mode4))
   16813              :         {
   16814            0 :           error ("the last argument must be scale 1, 2, 4, 8");
   16815            0 :           return const0_rtx;
   16816              :         }
   16817              : 
   16818          623 :       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
   16819          623 :       if (! pat)
   16820            0 :         return const0_rtx;
   16821              : 
   16822          623 :       emit_insn (pat);
   16823          623 :       return 0;
   16824              : 
   16825           23 :     case IX86_BUILTIN_XABORT:
   16826           23 :       icode = CODE_FOR_xabort;
   16827           23 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16828           23 :       op0 = expand_normal (arg0);
   16829           23 :       mode0 = insn_data[icode].operand[0].mode;
   16830           23 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   16831              :         {
   16832            0 :           error ("the argument to %<xabort%> intrinsic must "
   16833              :                  "be an 8-bit immediate");
   16834            0 :           return const0_rtx;
   16835              :         }
   16836           23 :       emit_insn (gen_xabort (op0));
   16837           23 :       return 0;
   16838              : 
   16839           55 :     case IX86_BUILTIN_RDSSPD:
   16840           55 :     case IX86_BUILTIN_RDSSPQ:
   16841           55 :       mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
   16842              : 
   16843           55 :       if (target == 0
   16844           55 :           || !register_operand (target, mode))
   16845            0 :         target = gen_reg_rtx (mode);
   16846              : 
   16847           55 :       op0 = force_reg (mode, const0_rtx);
   16848              : 
   16849           55 :       emit_insn (gen_rdssp (mode, target, op0));
   16850           55 :       return target;
   16851              : 
   16852           55 :     case IX86_BUILTIN_INCSSPD:
   16853           55 :     case IX86_BUILTIN_INCSSPQ:
   16854           55 :       mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
   16855              : 
   16856           55 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16857           55 :       op0 = expand_normal (arg0);
   16858              : 
   16859           55 :       op0 = force_reg (mode, op0);
   16860              : 
   16861           55 :       emit_insn (gen_incssp (mode, op0));
   16862           55 :       return 0;
   16863              : 
   16864           20 :     case IX86_BUILTIN_HRESET:
   16865           20 :       icode = CODE_FOR_hreset;
   16866           20 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16867           20 :       op0 = expand_normal (arg0);
   16868           20 :       op0 = force_reg (SImode, op0);
   16869           20 :       emit_insn (gen_hreset (op0));
   16870           20 :       return 0;
   16871              : 
   16872           38 :     case IX86_BUILTIN_RSTORSSP:
   16873           38 :     case IX86_BUILTIN_CLRSSBSY:
   16874           38 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16875           38 :       op0 = expand_normal (arg0);
   16876           19 :       icode = (fcode == IX86_BUILTIN_RSTORSSP
   16877           38 :                ? CODE_FOR_rstorssp
   16878              :                : CODE_FOR_clrssbsy);
   16879              : 
   16880           38 :       if (!address_operand (op0, VOIDmode))
   16881              :         {
   16882           18 :           op0 = convert_memory_address (Pmode, op0);
   16883           18 :           op0 = copy_addr_to_reg (op0);
   16884              :         }
   16885           38 :       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
   16886           38 :       return 0;
   16887              : 
   16888           80 :     case IX86_BUILTIN_WRSSD:
   16889           80 :     case IX86_BUILTIN_WRSSQ:
   16890           80 :     case IX86_BUILTIN_WRUSSD:
   16891           80 :     case IX86_BUILTIN_WRUSSQ:
   16892           80 :       mode = ((fcode == IX86_BUILTIN_WRSSD
   16893           80 :                || fcode == IX86_BUILTIN_WRUSSD)
   16894           80 :               ? SImode : DImode);
   16895              : 
   16896           80 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16897           80 :       op0 = expand_normal (arg0);
   16898           80 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16899           80 :       op1 = expand_normal (arg1);
   16900              : 
   16901           80 :       op0 = force_reg (mode, op0);
   16902              : 
   16903           80 :       if (!address_operand (op1, VOIDmode))
   16904              :         {
   16905           36 :           op1 = convert_memory_address (Pmode, op1);
   16906           36 :           op1 = copy_addr_to_reg (op1);
   16907              :         }
   16908           80 :       op1 = gen_rtx_MEM (mode, op1);
   16909              : 
   16910           80 :       icode = ((fcode == IX86_BUILTIN_WRSSD
   16911           80 :                 || fcode == IX86_BUILTIN_WRSSQ)
   16912           80 :                ? code_for_wrss (mode)
   16913           40 :                : code_for_wruss (mode));
   16914           80 :       emit_insn (GEN_FCN (icode) (op0, op1));
   16915              : 
   16916           80 :       return 0;
   16917              : 
   16918       116625 :     default:
   16919       116625 :       break;
   16920              :     }
   16921              : 
   16922       116625 :   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
   16923       116625 :       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
   16924              :     {
   16925        27059 :       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
   16926        27059 :       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
   16927        27059 :                                                target);
   16928              :     }
   16929              : 
   16930        89566 :   if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
   16931        89566 :       && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
   16932              :     {
   16933           93 :       i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
   16934           93 :       return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
   16935           93 :                                                target);
   16936              :     }
   16937              : 
   16938        89473 :   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
   16939        89473 :       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
   16940              :     {
   16941        71052 :       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
   16942              : 
   16943        71052 :       switch (fcode)
   16944              :         {
   16945            0 :           case IX86_BUILTIN_RDPID:
   16946            0 :             return ix86_expand_special_args_builtin (bdesc_args + i, exp,
   16947            0 :                                                      target);
   16948           74 :           case IX86_BUILTIN_VCOMISBF16EQ:
   16949           74 :           case IX86_BUILTIN_VCOMISBF16NE:
   16950           74 :           case IX86_BUILTIN_VCOMISBF16GT:
   16951           74 :           case IX86_BUILTIN_VCOMISBF16GE:
   16952           74 :           case IX86_BUILTIN_VCOMISBF16LT:
   16953           74 :           case IX86_BUILTIN_VCOMISBF16LE:
   16954           74 :             return ix86_expand_sse_comi (bdesc_args + i, exp, target, false);
   16955           15 :           case IX86_BUILTIN_FABSQ:
   16956           15 :           case IX86_BUILTIN_COPYSIGNQ:
   16957           15 :             if (!TARGET_SSE)
   16958              :               /* Emit a normal call if SSE isn't available.  */
   16959            0 :               return expand_call (exp, target, ignore);
   16960              :             /* FALLTHRU */
   16961        70978 :           default:
   16962        70978 :             return ix86_expand_args_builtin (bdesc_args + i, exp, target);
   16963              :           }
   16964              :     }
   16965              : 
   16966        18421 :   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
   16967        18421 :       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
   16968              :     {
   16969          473 :       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
   16970          473 :       return ix86_expand_sse_comi (bdesc_comi + i, exp, target, true);
   16971              :     }
   16972              : 
   16973        17948 :   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
   16974        17948 :       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
   16975              :     {
   16976        15604 :       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
   16977        15604 :       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
   16978              :     }
   16979              : 
   16980         2344 :   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
   16981         2344 :       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
   16982              :     {
   16983          216 :       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
   16984          216 :       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
   16985              :     }
   16986              : 
   16987         2128 :   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
   16988         2128 :       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
   16989              :     {
   16990          275 :       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
   16991          275 :       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
   16992              :     }
   16993              : 
   16994         1853 :   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
   16995         1853 :       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
   16996              :     {
   16997         1815 :       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
   16998         1815 :       const struct builtin_description *d = bdesc_multi_arg + i;
   16999         1815 :       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
   17000              :                                             (enum ix86_builtin_func_type)
   17001         1815 :                                             d->flag, d->comparison);
   17002              :     }
   17003              : 
   17004           38 :   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
   17005           38 :       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
   17006              :     {
   17007           38 :       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
   17008           38 :       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
   17009           38 :                                                target);
   17010              :     }
   17011              : 
   17012            0 :   gcc_unreachable ();
   17013              : }
   17014              : 
/* Algorithms for cheaply materializing a vector broadcast of certain
   SImode constants, selected via ix86_vec_bcast_map_simode below and
   expanded by ix86_vector_duplicate_simode_const.  */
enum ix86_vec_bcast_alg
{
  VEC_BCAST_PXOR,	/* All-zeros vector (move of CONST0).  */
  VEC_BCAST_PCMPEQ,	/* All-ones vector (move of CONSTM1).  */
  VEC_BCAST_PABSB,	/* 0x01 bytes: byte abs of an all-ones vector.  */
  VEC_BCAST_PADDB,	/* 0xfe bytes: all-ones bytes added to themselves.  */
  VEC_BCAST_PSRLW,	/* Logical right shift of all-ones HI elements.  */
  VEC_BCAST_PSRLD,	/* Logical right shift of all-ones SI elements.  */
  VEC_BCAST_PSLLW,	/* Left shift of all-ones HI elements.  */
  VEC_BCAST_PSLLD	/* Left shift of all-ones SI elements.  */
};
   17027              : 
/* One entry of the SImode broadcast-constant lookup table.  */
struct ix86_vec_bcast_map_simode_t
{
  /* The 32-bit constant to be broadcast; table lookup key.  */
  unsigned int key;
  /* How to materialize the broadcast of KEY.  */
  enum ix86_vec_bcast_alg alg;
  /* Operand for ALG: the shift count for the shift-based algorithms,
     zero for the others.  */
  unsigned int arg;
};
   17034              : 
/* Map from a 32-bit SImode constant to the cheapest known instruction
   sequence (alg) and its operand (arg) that materializes a vector with
   that constant broadcast to every element.
   This table must be kept sorted as values are looked-up using bsearch.  */
static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = {
  { 0x00000000, VEC_BCAST_PXOR,    0 },
  { 0x00000001, VEC_BCAST_PSRLD,  31 },
  { 0x00000003, VEC_BCAST_PSRLD,  30 },
  { 0x00000007, VEC_BCAST_PSRLD,  29 },
  { 0x0000000f, VEC_BCAST_PSRLD,  28 },
  { 0x0000001f, VEC_BCAST_PSRLD,  27 },
  { 0x0000003f, VEC_BCAST_PSRLD,  26 },
  { 0x0000007f, VEC_BCAST_PSRLD,  25 },
  { 0x000000ff, VEC_BCAST_PSRLD,  24 },
  { 0x000001ff, VEC_BCAST_PSRLD,  23 },
  { 0x000003ff, VEC_BCAST_PSRLD,  22 },
  { 0x000007ff, VEC_BCAST_PSRLD,  21 },
  { 0x00000fff, VEC_BCAST_PSRLD,  20 },
  { 0x00001fff, VEC_BCAST_PSRLD,  19 },
  { 0x00003fff, VEC_BCAST_PSRLD,  18 },
  { 0x00007fff, VEC_BCAST_PSRLD,  17 },
  { 0x0000ffff, VEC_BCAST_PSRLD,  16 },
  { 0x00010001, VEC_BCAST_PSRLW,  15 },
  { 0x0001ffff, VEC_BCAST_PSRLD,  15 },
  { 0x00030003, VEC_BCAST_PSRLW,  14 },
  { 0x0003ffff, VEC_BCAST_PSRLD,  14 },
  { 0x00070007, VEC_BCAST_PSRLW,  13 },
  { 0x0007ffff, VEC_BCAST_PSRLD,  13 },
  { 0x000f000f, VEC_BCAST_PSRLW,  12 },
  { 0x000fffff, VEC_BCAST_PSRLD,  12 },
  { 0x001f001f, VEC_BCAST_PSRLW,  11 },
  { 0x001fffff, VEC_BCAST_PSRLD,  11 },
  { 0x003f003f, VEC_BCAST_PSRLW,  10 },
  { 0x003fffff, VEC_BCAST_PSRLD,  10 },
  { 0x007f007f, VEC_BCAST_PSRLW,   9 },
  { 0x007fffff, VEC_BCAST_PSRLD,   9 },
  { 0x00ff00ff, VEC_BCAST_PSRLW,   8 },
  { 0x00ffffff, VEC_BCAST_PSRLD,   8 },
  { 0x01010101, VEC_BCAST_PABSB,   0 },
  { 0x01ff01ff, VEC_BCAST_PSRLW,   7 },
  { 0x01ffffff, VEC_BCAST_PSRLD,   7 },
  { 0x03ff03ff, VEC_BCAST_PSRLW,   6 },
  { 0x03ffffff, VEC_BCAST_PSRLD,   6 },
  { 0x07ff07ff, VEC_BCAST_PSRLW,   5 },
  { 0x07ffffff, VEC_BCAST_PSRLD,   5 },
  { 0x0fff0fff, VEC_BCAST_PSRLW,   4 },
  { 0x0fffffff, VEC_BCAST_PSRLD,   4 },
  { 0x1fff1fff, VEC_BCAST_PSRLW,   3 },
  { 0x1fffffff, VEC_BCAST_PSRLD,   3 },
  { 0x3fff3fff, VEC_BCAST_PSRLW,   2 },
  { 0x3fffffff, VEC_BCAST_PSRLD,   2 },
  { 0x7fff7fff, VEC_BCAST_PSRLW,   1 },
  { 0x7fffffff, VEC_BCAST_PSRLD,   1 },
  { 0x80000000, VEC_BCAST_PSLLD,  31 },
  { 0x80008000, VEC_BCAST_PSLLW,  15 },
  { 0xc0000000, VEC_BCAST_PSLLD,  30 },
  { 0xc000c000, VEC_BCAST_PSLLW,  14 },
  { 0xe0000000, VEC_BCAST_PSLLD,  29 },
  { 0xe000e000, VEC_BCAST_PSLLW,  13 },
  { 0xf0000000, VEC_BCAST_PSLLD,  28 },
  { 0xf000f000, VEC_BCAST_PSLLW,  12 },
  { 0xf8000000, VEC_BCAST_PSLLD,  27 },
  { 0xf800f800, VEC_BCAST_PSLLW,  11 },
  { 0xfc000000, VEC_BCAST_PSLLD,  26 },
  { 0xfc00fc00, VEC_BCAST_PSLLW,  10 },
  { 0xfe000000, VEC_BCAST_PSLLD,  25 },
  { 0xfe00fe00, VEC_BCAST_PSLLW,   9 },
  { 0xfefefefe, VEC_BCAST_PADDB,   0 },
  { 0xff000000, VEC_BCAST_PSLLD,  24 },
  { 0xff00ff00, VEC_BCAST_PSLLW,   8 },
  { 0xff800000, VEC_BCAST_PSLLD,  23 },
  { 0xff80ff80, VEC_BCAST_PSLLW,   7 },
  { 0xffc00000, VEC_BCAST_PSLLD,  22 },
  { 0xffc0ffc0, VEC_BCAST_PSLLW,   6 },
  { 0xffe00000, VEC_BCAST_PSLLD,  21 },
  { 0xffe0ffe0, VEC_BCAST_PSLLW,   5 },
  { 0xfff00000, VEC_BCAST_PSLLD,  20 },
  { 0xfff0fff0, VEC_BCAST_PSLLW,   4 },
  { 0xfff80000, VEC_BCAST_PSLLD,  19 },
  { 0xfff8fff8, VEC_BCAST_PSLLW,   3 },
  { 0xfffc0000, VEC_BCAST_PSLLD,  18 },
  { 0xfffcfffc, VEC_BCAST_PSLLW,   2 },
  { 0xfffe0000, VEC_BCAST_PSLLD,  17 },
  { 0xfffefffe, VEC_BCAST_PSLLW,   1 },
  { 0xffff0000, VEC_BCAST_PSLLD,  16 },
  { 0xffff8000, VEC_BCAST_PSLLD,  15 },
  { 0xffffc000, VEC_BCAST_PSLLD,  14 },
  { 0xffffe000, VEC_BCAST_PSLLD,  13 },
  { 0xfffff000, VEC_BCAST_PSLLD,  12 },
  { 0xfffff800, VEC_BCAST_PSLLD,  11 },
  { 0xfffffc00, VEC_BCAST_PSLLD,  10 },
  { 0xfffffe00, VEC_BCAST_PSLLD,   9 },
  { 0xffffff00, VEC_BCAST_PSLLD,   8 },
  { 0xffffff80, VEC_BCAST_PSLLD,   7 },
  { 0xffffffc0, VEC_BCAST_PSLLD,   6 },
  { 0xffffffe0, VEC_BCAST_PSLLD,   5 },
  { 0xfffffff0, VEC_BCAST_PSLLD,   4 },
  { 0xfffffff8, VEC_BCAST_PSLLD,   3 },
  { 0xfffffffc, VEC_BCAST_PSLLD,   2 },
  { 0xfffffffe, VEC_BCAST_PSLLD,   1 },
  { 0xffffffff, VEC_BCAST_PCMPEQ,  0 }
};
   17134              : 
   17135              : /* Comparator for bsearch on ix86_vec_bcast_map.  */
   17136              : static int
   17137       292957 : ix86_vec_bcast_map_simode_cmp (const void *key, const void *entry)
   17138              : {
   17139       292957 :   return (*(const unsigned int*)key)
   17140       292957 :          - ((const ix86_vec_bcast_map_simode_t*)entry)->key;
   17141              : }
   17142              : 
/* A subroutine of ix86_vector_duplicate_value.  Tries to efficiently
   materialize V4SImode, V8SImode and V16SImode vectors from SImode
   integer constants.  Looks VAL up in ix86_vec_bcast_map_simode and,
   if found, emits into TARGET the short sequence the table prescribes
   (all-zeros, all-ones, or a shift/abs/add of an all-ones vector).
   Returns true on success; returns false (emitting nothing useful)
   when VAL has no table entry or the ISA required for MODE by the
   chosen algorithm is not enabled, so the caller must materialize the
   constant another way.  */
static bool
ix86_vector_duplicate_simode_const (machine_mode mode, rtx target,
                                    unsigned int val)
{
  const ix86_vec_bcast_map_simode_t *entry;
  rtx tmp1, tmp2;

  /* Look VAL up among the constants with a known cheap
     materialization; the table is sorted by key.  */
  entry = (const ix86_vec_bcast_map_simode_t*)
          bsearch(&val, ix86_vec_bcast_map_simode,
                  ARRAY_SIZE (ix86_vec_bcast_map_simode),
                  sizeof (ix86_vec_bcast_map_simode_t),
                  ix86_vec_bcast_map_simode_cmp);
  if (!entry)
    return false;

  switch (entry->alg)
    {
    /* 0x00000000: all-zeros vector, a plain move of CONST0.  */
    case VEC_BCAST_PXOR:
      if ((mode == V8SImode && !TARGET_AVX2)
          || (mode == V16SImode && !TARGET_AVX512F))
        return false;
      emit_move_insn (target, CONST0_RTX (mode));
      return true;

    /* 0xffffffff: all-ones vector, a plain move of CONSTM1.  */
    case VEC_BCAST_PCMPEQ:
      if ((mode == V4SImode && !TARGET_SSE2)
          || (mode == V8SImode && !TARGET_AVX2)
          || (mode == V16SImode && !TARGET_AVX512F))
        return false;
      emit_move_insn (target, CONSTM1_RTX (mode));
      return true;

    /* 0x01010101: byte-wise absolute value of an all-ones (0xff)
       QImode vector; result is bitcast into MODE after the switch.  */
    case VEC_BCAST_PABSB:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V16QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
          tmp2 = gen_reg_rtx (V16QImode);
          emit_insn (gen_absv16qi2 (tmp2, tmp1));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V32QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
          tmp2 = gen_reg_rtx (V32QImode);
          emit_insn (gen_absv32qi2 (tmp2, tmp1));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V64QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
          tmp2 = gen_reg_rtx (V64QImode);
          emit_insn (gen_absv64qi2 (tmp2, tmp1));
        }
      else
        return false;
      break;

    /* 0xfefefefe: all-ones bytes added to themselves.  */
    case VEC_BCAST_PADDB:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V16QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
          tmp2 = gen_reg_rtx (V16QImode);
          emit_insn (gen_addv16qi3 (tmp2, tmp1, tmp1));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V32QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
          tmp2 = gen_reg_rtx (V32QImode);
          emit_insn (gen_addv32qi3 (tmp2, tmp1, tmp1));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V64QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
          tmp2 = gen_reg_rtx (V64QImode);
          emit_insn (gen_addv64qi3 (tmp2, tmp1, tmp1));
        }
      else
        return false;
      break;

    /* Per-HImode-element mask: logical right shift of an all-ones
       vector of HI elements by entry->arg bits.  */
    case VEC_BCAST_PSRLW:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V8HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
          tmp2 = gen_reg_rtx (V8HImode);
          emit_insn (gen_lshrv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V16HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
          tmp2 = gen_reg_rtx (V16HImode);
          emit_insn (gen_lshrv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V32HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
          tmp2 = gen_reg_rtx (V32HImode);
          emit_insn (gen_lshrv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else
        return false;
      break;

    /* Per-SImode-element mask: logical right shift of an all-ones
       vector by entry->arg bits.  The element mode matches MODE, so
       the result is shifted directly into TARGET.  */
    case VEC_BCAST_PSRLD:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V4SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
          emit_insn (gen_lshrv4si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V8SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
          emit_insn (gen_lshrv8si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V16SImode && TARGET_AVX512F)
        {
          tmp1 = gen_reg_rtx (V16SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
          emit_insn (gen_lshrv16si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else
        return false;
      break;	/* Not reachable; every branch above returns.  */

    /* High-bits mask per HImode element: left shift of all-ones.  */
    case VEC_BCAST_PSLLW:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V8HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
          tmp2 = gen_reg_rtx (V8HImode);
          emit_insn (gen_ashlv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V16HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
          tmp2 = gen_reg_rtx (V16HImode);
          emit_insn (gen_ashlv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V32HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
          tmp2 = gen_reg_rtx (V32HImode);
          emit_insn (gen_ashlv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else
        return false;
      break;

    /* High-bits mask per SImode element: left shift of all-ones,
       written directly into TARGET.  Every branch returns, so control
       cannot fall through to the default below.  */
    case VEC_BCAST_PSLLD:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V4SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
          emit_insn (gen_ashlv4si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V8SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
          emit_insn (gen_ashlv8si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V16SImode && TARGET_AVX512F)
        {
          tmp1 = gen_reg_rtx (V16SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
          emit_insn (gen_ashlv16si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else
        return false;

    default:
      return false;
    }

  /* The QImode/HImode algorithms built the result in TMP2 in a vector
     mode of the same size as MODE; view it as MODE and copy it into
     TARGET.  */
  emit_move_insn (target, gen_lowpart (mode, tmp2));
  return true;
}
   17340              : 
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill TARGET (of vector mode MODE) with VAL via vec_duplicate.
   Always returns true: if no duplicate pattern can be recognized even
   after forcing VAL into a register, the gcc_assert below aborts.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* For SImode integer constants, first try the cheap table-driven
     sequences (shifts/abs/add of all-ones vectors).  */
  if ((mode == V4SImode || mode == V8SImode || mode == V16SImode)
      && CONST_INT_P (val)
      && ix86_vector_duplicate_simode_const (mode, target, INTVAL (val)))
    return true;

  /* Save/restore recog_data in case this is called from splitters
     or other routines where recog_data needs to stay valid across
     force_reg.  See PR106577.  */
  recog_data_d recog_data_save = recog_data;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register or mem.  */

      /* Collect the fix-up insns in a sequence so they can be emitted
         before the already-emitted (unrecognized) duplicate INSN.  */
      start_sequence ();

      /* When the target prefers broadcasting from memory, spill small
         integer constants to the constant pool instead of a GPR.  */
      if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
          && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
          && GET_MODE_BITSIZE(mode) >= 128)
        reg = validize_mem (force_const_mem (innermode, val));
      else
        {
          reg = force_reg (innermode, val);
          if (GET_MODE (reg) != innermode)
            reg = gen_lowpart (innermode, reg);
        }

      /* Patch the already-emitted insn in place to duplicate REG, and
         retry recognition.  */
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = end_sequence ();
      if (seq)
        emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  recog_data = recog_data_save;
  return true;
}
   17396              : 
   17397              : /* Get a vector mode of the same size as the original but with elements
   17398              :    twice as wide.  This is only guaranteed to apply to integral vectors.  */
   17399              : 
   17400              : static machine_mode
   17401        18868 : get_mode_wider_vector (machine_mode o)
   17402              : {
   17403              :   /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
   17404        18868 :   machine_mode n = GET_MODE_NEXT_MODE (o).require ();
   17405        56604 :   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
   17406        56604 :   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
   17407        18868 :   return n;
   17408              : }
   17409              : 
   17410              : static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
   17411              : static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
   17412              : 
   17413              : /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   17414              :    with all elements equal to VAR.  Return true if successful.  */
   17415              : 
   17416              : bool
   17417       165060 : ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
   17418              :                                    rtx target, rtx val)
   17419              : {
   17420       165060 :   bool ok;
   17421              : 
   17422       165060 :   switch (mode)
   17423              :     {
   17424        70108 :     case E_V2DImode:
   17425        70108 :       if (CONST_INT_P (val))
   17426              :         {
   17427        61373 :           int tmp = (int)INTVAL (val);
   17428        61373 :           if (tmp == (int)(INTVAL (val) >> 32))
   17429              :             {
   17430          165 :               rtx reg = gen_reg_rtx (V4SImode);
   17431          165 :               ok = ix86_vector_duplicate_value (V4SImode, reg,
   17432              :                                                 GEN_INT (tmp));
   17433          165 :               if (ok)
   17434              :                 {
   17435          165 :                   emit_move_insn (target, gen_lowpart (V2DImode, reg));
   17436          165 :                   return true;
   17437              :                 }
   17438              :             }
   17439              :         }
   17440        69943 :       return ix86_vector_duplicate_value (mode, target, val);
   17441              : 
   17442         1092 :     case E_V4DImode:
   17443         1092 :       if (CONST_INT_P (val))
   17444              :         {
   17445          781 :           int tmp = (int)INTVAL (val);
   17446          781 :           if (tmp == (int)(INTVAL (val) >> 32))
   17447              :             {
   17448           54 :               rtx reg = gen_reg_rtx (V8SImode);
   17449           54 :               ok = ix86_vector_duplicate_value (V8SImode, reg,
   17450              :                                                 GEN_INT (tmp));
   17451           54 :               if (ok)
   17452              :                 {
   17453           54 :                   emit_move_insn (target, gen_lowpart (V4DImode, reg));
   17454           54 :                   return true;
   17455              :                 }
   17456              :             }
   17457              :         }
   17458         1038 :       return ix86_vector_duplicate_value (mode, target, val);
   17459              : 
   17460          513 :     case E_V8DImode:
   17461          513 :       if (CONST_INT_P (val))
   17462              :         {
   17463          264 :           int tmp = (int)INTVAL (val);
   17464          264 :           if (tmp == (int)(INTVAL (val) >> 32))
   17465              :             {
   17466           24 :               rtx reg = gen_reg_rtx (V16SImode);
   17467           24 :               ok = ix86_vector_duplicate_value (V16SImode, reg,
   17468              :                                                 GEN_INT (tmp));
   17469           24 :               if (ok)
   17470              :                 {
   17471           24 :                   emit_move_insn (target, gen_lowpart (V8DImode, reg));
   17472           24 :                   return true;
   17473              :                 }
   17474              :             }
   17475              :         }
   17476          489 :       return ix86_vector_duplicate_value (mode, target, val);
   17477              : 
   17478         2641 :     case E_V2SImode:
   17479         2641 :     case E_V2SFmode:
   17480         2641 :       if (!mmx_ok)
   17481              :         return false;
   17482              :       /* FALLTHRU */
   17483              : 
   17484        72168 :     case E_V4DFmode:
   17485        72168 :     case E_V8SFmode:
   17486        72168 :     case E_V8SImode:
   17487        72168 :     case E_V2DFmode:
   17488        72168 :     case E_V4SFmode:
   17489        72168 :     case E_V4SImode:
   17490        72168 :     case E_V16SImode:
   17491        72168 :     case E_V16SFmode:
   17492        72168 :     case E_V8DFmode:
   17493        72168 :       return ix86_vector_duplicate_value (mode, target, val);
   17494              : 
   17495          398 :     case E_V4HImode:
   17496          398 :       if (!mmx_ok)
   17497              :         return false;
   17498          395 :       if (TARGET_SSE || TARGET_3DNOW_A)
   17499              :         {
   17500          395 :           rtx x;
   17501              : 
   17502          395 :           val = gen_lowpart (SImode, val);
   17503          395 :           if (CONST_INT_P (val))
   17504              :             return false;
   17505          393 :           x = gen_rtx_TRUNCATE (HImode, val);
   17506          393 :           x = gen_rtx_VEC_DUPLICATE (mode, x);
   17507          393 :           emit_insn (gen_rtx_SET (target, x));
   17508          393 :           return true;
   17509              :         }
   17510            0 :       goto widen;
   17511              : 
   17512            5 :     case E_V4HFmode:
   17513            5 :     case E_V4BFmode:
   17514            5 :       if (TARGET_MMX_WITH_SSE)
   17515              :         {
   17516           10 :           val = force_reg (GET_MODE_INNER (mode), val);
   17517            5 :           rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
   17518            5 :           emit_insn (gen_rtx_SET (target, x));
   17519            5 :           return true;
   17520              :         }
   17521              :       return false;
   17522              : 
   17523          108 :     case E_V2HImode:
   17524          108 :       if (TARGET_SSE2)
   17525              :         {
   17526          108 :           rtx x;
   17527              : 
   17528          108 :           val = gen_lowpart (SImode, val);
   17529          108 :           if (CONST_INT_P (val))
   17530              :             return false;
   17531          108 :           x = gen_rtx_TRUNCATE (HImode, val);
   17532          108 :           x = gen_rtx_VEC_DUPLICATE (mode, x);
   17533          108 :           emit_insn (gen_rtx_SET (target, x));
   17534          108 :           return true;
   17535              :         }
   17536              :       return false;
   17537              : 
   17538            3 :     case E_V2HFmode:
   17539            3 :     case E_V2BFmode:
   17540            3 :       if (TARGET_SSE2)
   17541              :         {
   17542            6 :           val = force_reg (GET_MODE_INNER (mode), val);
   17543            3 :           rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
   17544            3 :           emit_insn (gen_rtx_SET (target, x));
   17545            3 :           return true;
   17546              :         }
   17547              :       return false;
   17548              : 
   17549          297 :     case E_V8QImode:
   17550          297 :     case E_V4QImode:
   17551          297 :       if (!mmx_ok)
   17552              :         return false;
   17553          293 :       goto widen;
   17554              : 
   17555        10199 :     case E_V8HImode:
   17556        10199 :       if (CONST_INT_P (val))
   17557         9678 :         goto widen;
   17558              :       /* FALLTHRU */
   17559              : 
   17560          835 :     case E_V8HFmode:
   17561          835 :     case E_V8BFmode:
   17562          835 :       if (TARGET_AVX2)
   17563          391 :         return ix86_vector_duplicate_value (mode, target, val);
   17564              : 
   17565          444 :       if (TARGET_SSE2)
   17566              :         {
   17567         1140 :           struct expand_vec_perm_d dperm;
   17568         1140 :           rtx tmp1, tmp2;
   17569              : 
   17570          444 :         permute:
   17571         1140 :           memset (&dperm, 0, sizeof (dperm));
   17572         1140 :           dperm.target = target;
   17573         1140 :           dperm.vmode = mode;
   17574         1140 :           dperm.nelt = GET_MODE_NUNITS (mode);
   17575         1140 :           dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
   17576         1140 :           dperm.one_operand_p = true;
   17577              : 
   17578         1140 :           if (mode == V8HFmode || mode == V8BFmode)
   17579              :             {
   17580            3 :               tmp1 = force_reg (GET_MODE_INNER (mode), val);
   17581            3 :               tmp2 = gen_reg_rtx (mode);
   17582            3 :               emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
   17583            3 :               tmp1 = gen_lowpart (mode, tmp2);
   17584              :             }
   17585              :           else
   17586              :             {
   17587              :               /* Extend to SImode using a paradoxical SUBREG.  */
   17588         1137 :               tmp1 = gen_reg_rtx (SImode);
   17589         1137 :               emit_move_insn (tmp1, gen_lowpart (SImode, val));
   17590              : 
   17591              :               /* Insert the SImode value as
   17592              :                  low element of a V4SImode vector.  */
   17593         1137 :               tmp2 = gen_reg_rtx (V4SImode);
   17594         1137 :               emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
   17595         1137 :               tmp1 = gen_lowpart (mode, tmp2);
   17596              :             }
   17597              : 
   17598         1140 :           emit_move_insn (dperm.op0, tmp1);
   17599         1140 :           ok = (expand_vec_perm_1 (&dperm)
   17600         1140 :                 || expand_vec_perm_broadcast_1 (&dperm));
   17601            0 :           gcc_assert (ok);
   17602         1140 :           return ok;
   17603              :         }
   17604            0 :       goto widen;
   17605              : 
   17606         5883 :     case E_V16QImode:
   17607         5883 :       if (CONST_INT_P (val))
   17608         5131 :         goto widen;
   17609          752 :       if (TARGET_AVX2)
   17610           56 :         return ix86_vector_duplicate_value (mode, target, val);
   17611              : 
   17612          696 :       if (TARGET_SSE2)
   17613          696 :         goto permute;
   17614            0 :       goto widen;
   17615              : 
   17616        17306 :     widen:
   17617              :       /* Replicate the value once into the next wider mode and recurse.  */
   17618        17306 :       {
   17619        17306 :         machine_mode smode, wsmode, wvmode;
   17620        17306 :         rtx x;
   17621              : 
   17622        17306 :         smode = GET_MODE_INNER (mode);
   17623        17306 :         wvmode = get_mode_wider_vector (mode);
   17624        17306 :         wsmode = GET_MODE_INNER (wvmode);
   17625              : 
   17626        17306 :         val = convert_modes (wsmode, smode, val, true);
   17627              : 
   17628        17306 :         if (CONST_INT_P (val))
   17629              :           {
   17630        34028 :             x = simplify_binary_operation (ASHIFT, wsmode, val,
   17631        17014 :                                            GEN_INT (GET_MODE_BITSIZE (smode)));
   17632        17014 :             val = simplify_binary_operation (IOR, wsmode, val, x);
   17633              :           }
   17634          292 :         else if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
   17635          292 :           emit_insn (gen_insv_1 (wsmode, val, val));
   17636              :         else
   17637              :           {
   17638            0 :             x = expand_simple_binop (wsmode, ASHIFT, val,
   17639            0 :                                      GEN_INT (GET_MODE_BITSIZE (smode)),
   17640              :                                      NULL_RTX, 1, OPTAB_LIB_WIDEN);
   17641            0 :             val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
   17642              :                                        OPTAB_LIB_WIDEN);
   17643              :           }
   17644              : 
   17645        17306 :         x = gen_reg_rtx (wvmode);
   17646        17306 :         ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
   17647        17306 :         if (!ok)
   17648              :           return false;
   17649        17305 :         emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
   17650        17305 :         return true;
   17651              :       }
   17652              : 
   17653         1435 :     case E_V16HImode:
   17654         1435 :     case E_V32QImode:
   17655         1435 :       if (CONST_INT_P (val))
   17656         1140 :         goto widen;
   17657              :       /* FALLTHRU */
   17658              : 
   17659          378 :     case E_V16HFmode:
   17660          378 :     case E_V16BFmode:
   17661          378 :       if (TARGET_AVX2)
   17662          350 :         return ix86_vector_duplicate_value (mode, target, val);
   17663              :       else
   17664              :         {
   17665           28 :           machine_mode hvmode;
   17666           28 :           switch (mode)
   17667              :             {
   17668              :             case V16HImode:
   17669              :               hvmode = V8HImode;
   17670              :               break;
   17671            0 :             case V16HFmode:
   17672            0 :               hvmode = V8HFmode;
   17673            0 :               break;
   17674            1 :             case V16BFmode:
   17675            1 :               hvmode = V8BFmode;
   17676            1 :               break;
   17677           14 :             case V32QImode:
   17678           14 :               hvmode = V16QImode;
   17679           14 :               break;
   17680            0 :             default:
   17681            0 :               gcc_unreachable ();
   17682              :             }
   17683           28 :           rtx x = gen_reg_rtx (hvmode);
   17684              : 
   17685           28 :           ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
   17686           28 :           if (!ok)
   17687              :             return false;
   17688              : 
   17689           28 :           x = gen_rtx_VEC_CONCAT (mode, x, x);
   17690           28 :           emit_insn (gen_rtx_SET (target, x));
   17691              :         }
   17692           28 :       return true;
   17693              : 
   17694         1194 :     case E_V32HImode:
   17695         1194 :     case E_V64QImode:
   17696         1194 :       if (CONST_INT_P (val))
   17697         1064 :         goto widen;
   17698              :       /* FALLTHRU */
   17699              : 
   17700          209 :     case E_V32HFmode:
   17701          209 :     case E_V32BFmode:
   17702          209 :       if (TARGET_AVX512BW)
   17703          189 :         return ix86_vector_duplicate_value (mode, target, val);
   17704              :       else
   17705              :         {
   17706           20 :           machine_mode hvmode;
   17707           20 :           switch (mode)
   17708              :             {
   17709              :             case V32HImode:
   17710              :               hvmode = V16HImode;
   17711              :               break;
   17712            0 :             case V32HFmode:
   17713            0 :               hvmode = V16HFmode;
   17714            0 :               break;
   17715            1 :             case V32BFmode:
   17716            1 :               hvmode = V16BFmode;
   17717            1 :               break;
   17718           10 :             case V64QImode:
   17719           10 :               hvmode = V32QImode;
   17720           10 :               break;
   17721            0 :             default:
   17722            0 :               gcc_unreachable ();
   17723              :             }
   17724           20 :           rtx x = gen_reg_rtx (hvmode);
   17725              : 
   17726           20 :           ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
   17727           20 :           if (!ok)
   17728              :             return false;
   17729              : 
   17730           20 :           x = gen_rtx_VEC_CONCAT (mode, x, x);
   17731           20 :           emit_insn (gen_rtx_SET (target, x));
   17732              :         }
   17733           20 :       return true;
   17734              : 
   17735              :     default:
   17736              :       return false;
   17737              :     }
   17738              : }
   17739              : 
   17740              : /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   17741              :    whose ONE_VAR element is VAR, and other elements are zero.  Return true
   17742              :    if successful.  */
   17743              : 
   17744              : bool
   17745        10282 : ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
   17746              :                                      rtx target, rtx var, int one_var)
   17747              : {
   17748        10282 :   machine_mode vsimode;
   17749        10282 :   rtx new_target;
   17750        10282 :   rtx x, tmp;
   17751        10282 :   bool use_vector_set = false;
   17752        10282 :   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
   17753              : 
   17754        10282 :   switch (mode)
   17755              :     {
   17756         7860 :     case E_V2DImode:
   17757              :       /* For SSE4.1, we normally use vector set.  But if the second
   17758              :          element is zero and inter-unit moves are OK, we use movq
   17759              :          instead.  */
   17760         7851 :       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
   17761         7983 :                         && !(TARGET_INTER_UNIT_MOVES_TO_VEC
   17762              :                              && one_var == 0));
   17763              :       break;
   17764          877 :     case E_V16QImode:
   17765          877 :     case E_V4SImode:
   17766          877 :     case E_V4SFmode:
   17767          877 :       use_vector_set = TARGET_SSE4_1;
   17768          877 :       break;
   17769           86 :     case E_V8HImode:
   17770           86 :       use_vector_set = TARGET_SSE2;
   17771           86 :       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
   17772           86 :         ? gen_vec_setv8hi_0 : NULL;
   17773              :       break;
   17774            8 :     case E_V8QImode:
   17775            8 :       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   17776              :       break;
   17777           14 :     case E_V4HImode:
   17778           14 :     case E_V4HFmode:
   17779           14 :     case E_V4BFmode:
   17780           14 :       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
   17781              :       break;
   17782           32 :     case E_V4QImode:
   17783           32 :       use_vector_set = TARGET_SSE4_1;
   17784           32 :       break;
   17785            0 :     case E_V32QImode:
   17786            0 :       use_vector_set = TARGET_AVX;
   17787            0 :       break;
   17788            5 :     case E_V16HImode:
   17789            5 :       use_vector_set = TARGET_AVX;
   17790            5 :       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
   17791            5 :         ? gen_vec_setv16hi_0 : NULL;
   17792              :       break;
   17793            5 :     case E_V8SImode:
   17794            5 :       use_vector_set = TARGET_AVX;
   17795            5 :       gen_vec_set_0 = gen_vec_setv8si_0;
   17796            5 :       break;
   17797           22 :     case E_V8SFmode:
   17798           22 :       use_vector_set = TARGET_AVX;
   17799           22 :       gen_vec_set_0 = gen_vec_setv8sf_0;
   17800           22 :       break;
   17801           13 :     case E_V4DFmode:
   17802           13 :       use_vector_set = TARGET_AVX;
   17803           13 :       gen_vec_set_0 = gen_vec_setv4df_0;
   17804           13 :       break;
   17805            7 :     case E_V4DImode:
   17806              :       /* Use ix86_expand_vector_set in 64bit mode only.  */
   17807            7 :       use_vector_set = TARGET_AVX && TARGET_64BIT;
   17808              :       gen_vec_set_0 = gen_vec_setv4di_0;
   17809              :       break;
   17810           17 :     case E_V16SImode:
   17811           17 :       use_vector_set = TARGET_AVX512F && one_var == 0;
   17812              :       gen_vec_set_0 = gen_vec_setv16si_0;
   17813              :       break;
   17814           22 :     case E_V16SFmode:
   17815           22 :       use_vector_set = TARGET_AVX512F && one_var == 0;
   17816              :       gen_vec_set_0 = gen_vec_setv16sf_0;
   17817              :       break;
   17818            0 :     case E_V8DFmode:
   17819            0 :       use_vector_set = TARGET_AVX512F && one_var == 0;
   17820              :       gen_vec_set_0 = gen_vec_setv8df_0;
   17821              :       break;
   17822            2 :     case E_V8DImode:
   17823              :       /* Use ix86_expand_vector_set in 64bit mode only.  */
   17824            2 :       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
   17825              :       gen_vec_set_0 = gen_vec_setv8di_0;
   17826              :       break;
   17827           39 :     case E_V8HFmode:
   17828           39 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17829              :       gen_vec_set_0 = gen_vec_setv8hf_0;
   17830              :       break;
   17831            9 :     case E_V16HFmode:
   17832            9 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17833              :       gen_vec_set_0 = gen_vec_setv16hf_0;
   17834              :       break;
   17835            6 :     case E_V32HFmode:
   17836            6 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17837              :       gen_vec_set_0 = gen_vec_setv32hf_0;
   17838              :       break;
   17839            2 :     case E_V8BFmode:
   17840            2 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17841              :       gen_vec_set_0 = gen_vec_setv8bf_0;
   17842              :       break;
   17843            0 :     case E_V16BFmode:
   17844            0 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17845              :       gen_vec_set_0 = gen_vec_setv16bf_0;
   17846              :       break;
   17847            0 :     case E_V32BFmode:
   17848            0 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17849              :       gen_vec_set_0 = gen_vec_setv32bf_0;
   17850              :       break;
   17851            4 :     case E_V32HImode:
   17852            4 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17853              :       gen_vec_set_0 = gen_vec_setv32hi_0;
   17854              :     default:
   17855              :       break;
   17856              :     }
   17857              : 
   17858         8914 :   if (use_vector_set)
   17859              :     {
   17860          862 :       if (gen_vec_set_0 && one_var == 0)
   17861              :         {
   17862          354 :           var = force_reg (GET_MODE_INNER (mode), var);
   17863          177 :           emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
   17864          177 :           return true;
   17865              :         }
   17866          685 :       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
   17867         1370 :       var = force_reg (GET_MODE_INNER (mode), var);
   17868          685 :       ix86_expand_vector_set (mmx_ok, target, var, one_var);
   17869          685 :       return true;
   17870              :     }
   17871              : 
   17872         9420 :   switch (mode)
   17873              :     {
   17874         1155 :     case E_V2SFmode:
   17875         1155 :     case E_V2SImode:
   17876         1155 :       if (!mmx_ok)
   17877              :         return false;
   17878              :       /* FALLTHRU */
   17879              : 
   17880         8098 :     case E_V2DFmode:
   17881         8098 :     case E_V2DImode:
   17882         8098 :       if (one_var != 0)
   17883              :         return false;
   17884         5010 :       var = force_reg (GET_MODE_INNER (mode), var);
   17885         5010 :       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
   17886         2505 :       emit_insn (gen_rtx_SET (target, x));
   17887         2505 :       return true;
   17888              : 
   17889          313 :     case E_V4SFmode:
   17890          313 :     case E_V4SImode:
   17891          313 :       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
   17892            0 :         new_target = gen_reg_rtx (mode);
   17893              :       else
   17894              :         new_target = target;
   17895          626 :       var = force_reg (GET_MODE_INNER (mode), var);
   17896          313 :       x = gen_rtx_VEC_DUPLICATE (mode, var);
   17897          313 :       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
   17898          313 :       emit_insn (gen_rtx_SET (new_target, x));
   17899          313 :       if (one_var != 0)
   17900              :         {
   17901              :           /* We need to shuffle the value to the correct position, so
   17902              :              create a new pseudo to store the intermediate result.  */
   17903              : 
   17904              :           /* With SSE2, we can use the integer shuffle insns.  */
   17905           41 :           if (mode != V4SFmode && TARGET_SSE2)
   17906              :             {
   17907           28 :               emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
   17908              :                                             const1_rtx,
   17909           28 :                                             GEN_INT (one_var == 1 ? 0 : 1),
   17910           28 :                                             GEN_INT (one_var == 2 ? 0 : 1),
   17911           28 :                                             GEN_INT (one_var == 3 ? 0 : 1)));
   17912           28 :               if (target != new_target)
   17913            0 :                 emit_move_insn (target, new_target);
   17914           28 :               return true;
   17915              :             }
   17916              : 
   17917              :           /* Otherwise convert the intermediate result to V4SFmode and
   17918              :              use the SSE1 shuffle instructions.  */
   17919            0 :           if (mode != V4SFmode)
   17920              :             {
   17921            0 :               tmp = gen_reg_rtx (V4SFmode);
   17922            0 :               emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
   17923              :             }
   17924              :           else
   17925              :             tmp = new_target;
   17926              : 
   17927           43 :           emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
   17928              :                                        const1_rtx,
   17929           13 :                                        GEN_INT (one_var == 1 ? 0 : 1),
   17930              :                                        GEN_INT (one_var == 2 ? 0+4 : 1+4),
   17931              :                                        GEN_INT (one_var == 3 ? 0+4 : 1+4)));
   17932              : 
   17933           13 :           if (mode != V4SFmode)
   17934            0 :             emit_move_insn (target, gen_lowpart (V4SImode, tmp));
   17935           13 :           else if (tmp != target)
   17936            0 :             emit_move_insn (target, tmp);
   17937              :         }
   17938          272 :       else if (target != new_target)
   17939            0 :         emit_move_insn (target, new_target);
   17940              :       return true;
   17941              : 
   17942           13 :     case E_V8HImode:
   17943           13 :     case E_V16QImode:
   17944           13 :       vsimode = V4SImode;
   17945           13 :       goto widen;
   17946            3 :     case E_V4HImode:
   17947            3 :     case E_V8QImode:
   17948            3 :       if (!mmx_ok)
   17949              :         return false;
   17950            3 :       vsimode = V2SImode;
   17951            3 :       goto widen;
   17952           16 :     widen:
   17953           16 :       if (one_var != 0)
   17954              :         return false;
   17955              : 
   17956              :       /* Zero extend the variable element to SImode and recurse.  */
   17957           16 :       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
   17958              : 
   17959            8 :       x = gen_reg_rtx (vsimode);
   17960            8 :       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
   17961              :                                                 var, one_var))
   17962            0 :         gcc_unreachable ();
   17963              : 
   17964            8 :       emit_move_insn (target, gen_lowpart (mode, x));
   17965            8 :       return true;
   17966              : 
   17967              :     default:
   17968              :       return false;
   17969              :     }
   17970              : }
   17971              : 
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  /* Build CONST_VEC: a copy of VALS with the variable element replaced
     by zero, so it can be materialized as a constant vector load.  */
  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V8BFmode:
    case E_V16BFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      /* These modes fall through to the generic load-constant +
	 vec_set sequence after the switch.  */
      break;

    case E_V16QImode:
      /* QImode element insertion needs SSE4.1 (pinsrb); otherwise
	 promote to the HImode vector of half the element count.  */
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;
    case E_V4QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V2HImode;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  /* VAR is the high byte of the HImode pair: shift it up and
	     OR in the low byte of the neighbouring constant.  */
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  /* VAR is the low byte: the neighbouring constant supplies
	     the high byte.  */
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      /* Load the zero-filled constant in the wider mode and insert the
	 combined HImode value at the pair's index (ONE_VAR >> 1).  */
      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  /* Generic sequence: load the constant vector, then overwrite the
     single variable element in place.  */
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
   18070              : 
   18071              : /* A subroutine of ix86_expand_vector_init_general.  Use vector
   18072              :    concatenate to handle the most general case: all values variable,
   18073              :    and none identical.  */
   18074              : 
   18075              : static void
   18076       118491 : ix86_expand_vector_init_concat (machine_mode mode,
   18077              :                                 rtx target, rtx *ops, int n)
   18078              : {
   18079       118491 :   machine_mode half_mode = VOIDmode;
   18080       118491 :   rtx half[2];
   18081       118491 :   rtvec v;
   18082       118491 :   int i, j;
   18083              : 
   18084       118491 :   switch (n)
   18085              :     {
   18086       110057 :     case 2:
   18087       110057 :       switch (mode)
   18088              :         {
   18089              :         case E_V32HFmode:
   18090              :           half_mode = V16HFmode;
   18091              :           break;
   18092            0 :         case E_V32BFmode:
   18093            0 :           half_mode = V16BFmode;
   18094            0 :           break;
   18095           81 :         case E_V16SImode:
   18096           81 :           half_mode = V8SImode;
   18097           81 :           break;
   18098           33 :         case E_V16SFmode:
   18099           33 :           half_mode = V8SFmode;
   18100           33 :           break;
   18101           92 :         case E_V8DImode:
   18102           92 :           half_mode = V4DImode;
   18103           92 :           break;
   18104           73 :         case E_V8DFmode:
   18105           73 :           half_mode = V4DFmode;
   18106           73 :           break;
   18107            0 :         case E_V16HFmode:
   18108            0 :           half_mode = V8HFmode;
   18109            0 :           break;
   18110            0 :         case E_V16BFmode:
   18111            0 :           half_mode = V8BFmode;
   18112            0 :           break;
   18113          197 :         case E_V8SImode:
   18114          197 :           half_mode = V4SImode;
   18115          197 :           break;
   18116          271 :         case E_V8SFmode:
   18117          271 :           half_mode = V4SFmode;
   18118          271 :           break;
   18119          308 :         case E_V4DImode:
   18120          308 :           half_mode = V2DImode;
   18121          308 :           break;
   18122          633 :         case E_V4DFmode:
   18123          633 :           half_mode = V2DFmode;
   18124          633 :           break;
   18125         5940 :         case E_V4SImode:
   18126         5940 :           half_mode = V2SImode;
   18127         5940 :           break;
   18128         2273 :         case E_V4SFmode:
   18129         2273 :           half_mode = V2SFmode;
   18130         2273 :           break;
   18131        64065 :         case E_V2DImode:
   18132        64065 :           half_mode = DImode;
   18133        64065 :           break;
   18134        27072 :         case E_V2SImode:
   18135        27072 :           half_mode = SImode;
   18136        27072 :           break;
   18137         3529 :         case E_V2DFmode:
   18138         3529 :           half_mode = DFmode;
   18139         3529 :           break;
   18140         5490 :         case E_V2SFmode:
   18141         5490 :           half_mode = SFmode;
   18142         5490 :           break;
   18143            0 :         default:
   18144            0 :           gcc_unreachable ();
   18145              :         }
   18146              : 
   18147       110057 :       if (!register_operand (ops[1], half_mode))
   18148        47876 :         ops[1] = force_reg (half_mode, ops[1]);
   18149       110057 :       if (!register_operand (ops[0], half_mode))
   18150        36337 :         ops[0] = force_reg (half_mode, ops[0]);
   18151       110057 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
   18152              :                                                           ops[1])));
   18153       110057 :       break;
   18154              : 
   18155         7796 :     case 4:
   18156         7796 :       switch (mode)
   18157              :         {
   18158              :         case E_V4DImode:
   18159              :           half_mode = V2DImode;
   18160              :           break;
   18161          550 :         case E_V4DFmode:
   18162          550 :           half_mode = V2DFmode;
   18163          550 :           break;
   18164         4964 :         case E_V4SImode:
   18165         4964 :           half_mode = V2SImode;
   18166         4964 :           break;
   18167         2100 :         case E_V4SFmode:
   18168         2100 :           half_mode = V2SFmode;
   18169         2100 :           break;
   18170            0 :         default:
   18171            0 :           gcc_unreachable ();
   18172              :         }
   18173         7796 :       goto half;
   18174              : 
   18175          545 :     case 8:
   18176          545 :       switch (mode)
   18177              :         {
   18178              :         case E_V8DImode:
   18179              :           half_mode = V4DImode;
   18180              :           break;
   18181           73 :         case E_V8DFmode:
   18182           73 :           half_mode = V4DFmode;
   18183           73 :           break;
   18184          156 :         case E_V8SImode:
   18185          156 :           half_mode = V4SImode;
   18186          156 :           break;
   18187          265 :         case E_V8SFmode:
   18188          265 :           half_mode = V4SFmode;
   18189          265 :           break;
   18190            0 :         default:
   18191            0 :           gcc_unreachable ();
   18192              :         }
   18193          545 :       goto half;
   18194              : 
   18195           93 :     case 16:
   18196           93 :       switch (mode)
   18197              :         {
   18198              :         case E_V16SImode:
   18199              :           half_mode = V8SImode;
   18200              :           break;
   18201           33 :         case E_V16SFmode:
   18202           33 :           half_mode = V8SFmode;
   18203           33 :           break;
   18204            0 :         default:
   18205            0 :           gcc_unreachable ();
   18206              :         }
   18207           93 :       goto half;
   18208              : 
   18209         8434 : half:
   18210              :       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
   18211         8434 :       i = n - 1;
   18212        25302 :       for (j = 1; j != -1; j--)
   18213              :         {
   18214        16868 :           half[j] = gen_reg_rtx (half_mode);
   18215        16868 :           switch (n >> 1)
   18216              :             {
   18217        15592 :             case 2:
   18218        15592 :               v = gen_rtvec (2, ops[i-1], ops[i]);
   18219        15592 :               i -= 2;
   18220        15592 :               break;
   18221         1090 :             case 4:
   18222         1090 :               v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
   18223         1090 :               i -= 4;
   18224         1090 :               break;
   18225          186 :             case 8:
   18226          372 :               v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
   18227          186 :                              ops[i-3], ops[i-2], ops[i-1], ops[i]);
   18228          186 :               i -= 8;
   18229          186 :               break;
   18230            0 :             default:
   18231            0 :               gcc_unreachable ();
   18232              :             }
   18233        16868 :           ix86_expand_vector_init (false, half[j],
   18234              :                                    gen_rtx_PARALLEL (half_mode, v));
   18235              :         }
   18236              : 
   18237         8434 :       ix86_expand_vector_init_concat (mode, target, half, 2);
   18238         8434 :       break;
   18239              : 
   18240            0 :     default:
   18241            0 :       gcc_unreachable ();
   18242              :     }
   18243       118491 : }
   18244              : 
   18245              : /* A subroutine of ix86_expand_vector_init_general.  Use vector
   18246              :    interleave to handle the most general case: all values variable,
   18247              :    and none identical.  */
   18248              : 
   18249              : static void
   18250         3881 : ix86_expand_vector_init_interleave (machine_mode mode,
   18251              :                                     rtx target, rtx *ops, int n)
   18252              : {
   18253         3881 :   machine_mode first_imode, second_imode, third_imode, inner_mode;
   18254         3881 :   int i, j;
   18255         3881 :   rtx op, op0, op1;
   18256         3881 :   rtx (*gen_load_even) (rtx, rtx, rtx);
   18257         3881 :   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
   18258         3881 :   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
   18259              : 
                      :   /* OPS holds 2*N scalar elements: the loop below consumes
                      :      OPS[2i] / OPS[2i+1] pairs, packs each pair into one vector
                      :      register, and then merges the packed vectors with successive
                      :      "interleave low" steps until a single MODE vector is left in
                      :      TARGET.  Select the pair-packing generator, the interleave
                      :      generators, and the integer modes used for the interleave
                      :      rounds according to MODE.  */
   18260         3881 :   switch (mode)
   18261              :     {
   18262              :     case E_V8HFmode:
   18263              :       gen_load_even = gen_vec_interleave_lowv8hf;
   18264              :       gen_interleave_first_low = gen_vec_interleave_lowv4si;
   18265              :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18266              :       inner_mode = HFmode;
   18267              :       first_imode = V4SImode;
   18268              :       second_imode = V2DImode;
   18269              :       third_imode = VOIDmode;
   18270              :       break;
   18271          487 :     case E_V8BFmode:
   18272          487 :       gen_load_even = gen_vec_interleave_lowv8bf;
   18273          487 :       gen_interleave_first_low = gen_vec_interleave_lowv4si;
   18274          487 :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18275          487 :       inner_mode = BFmode;
   18276          487 :       first_imode = V4SImode;
   18277          487 :       second_imode = V2DImode;
   18278          487 :       third_imode = VOIDmode;
   18279          487 :       break;
   18280          793 :     case E_V8HImode:
   18281          793 :       gen_load_even = gen_vec_setv8hi;
   18282          793 :       gen_interleave_first_low = gen_vec_interleave_lowv4si;
   18283          793 :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18284          793 :       inner_mode = HImode;
   18285          793 :       first_imode = V4SImode;
   18286          793 :       second_imode = V2DImode;
   18287          793 :       third_imode = VOIDmode;
   18288          793 :       break;
   18289          374 :     case E_V16QImode:
   18290          374 :       gen_load_even = gen_vec_setv16qi;
   18291          374 :       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
   18292          374 :       gen_interleave_second_low = gen_vec_interleave_lowv4si;
   18293          374 :       inner_mode = QImode;
   18294          374 :       first_imode = V8HImode;
   18295          374 :       second_imode = V4SImode;
   18296          374 :       third_imode = V2DImode;
   18297          374 :       break;
   18298            0 :     default:
   18299            0 :       gcc_unreachable ();
   18300              :     }
   18301              : 
                      :   /* Pack each adjacent pair of input elements into one vector
                      :      register, leaving the N packed vectors in OPS[0..N-1].  */
   18302        20901 :   for (i = 0; i < n; i++)
   18303              :     {
   18304        17020 :       op = ops [i + i];
   18305        17020 :       if (inner_mode == HFmode || inner_mode == BFmode)
   18306              :         {
   18307        10856 :           rtx even, odd;
   18308              :           /* Use vpunpcklwd to pack 2 HFmode or BFmode elements.  */
   18309         1948 :           machine_mode vec_mode =
   18310        10856 :             (inner_mode == HFmode) ? V8HFmode : V8BFmode;
   18311        10856 :           op0 = gen_reg_rtx (vec_mode);
   18312        10856 :           even = lowpart_subreg (vec_mode,
   18313              :                                  force_reg (inner_mode, op), inner_mode);
   18314        10856 :           odd = lowpart_subreg (vec_mode,
   18315        10856 :                                 force_reg (inner_mode, ops[i + i + 1]),
   18316              :                                 inner_mode);
   18317        10856 :           emit_insn (gen_load_even (op0, even, odd));
   18318              :         }
   18319              :       else
   18320              :         {
   18321              :           /* Extend the odd element to SImode using a paradoxical SUBREG.  */
   18322         6164 :           op0 = gen_reg_rtx (SImode);
   18323         6164 :           emit_move_insn (op0, gen_lowpart (SImode, op));
   18324              : 
   18325              :           /* Insert the SImode value as low element of V4SImode vector.  */
   18326         6164 :           op1 = gen_reg_rtx (V4SImode);
   18327         6164 :           op0 = gen_rtx_VEC_MERGE (V4SImode,
   18328              :                                    gen_rtx_VEC_DUPLICATE (V4SImode,
   18329              :                                                           op0),
   18330              :                                    CONST0_RTX (V4SImode),
   18331              :                                    const1_rtx);
   18332         6164 :           emit_insn (gen_rtx_SET (op1, op0));
   18333              : 
   18334              :           /* Cast the V4SImode vector back to a vector in original mode.  */
   18335         6164 :           op0 = gen_reg_rtx (mode);
   18336         6164 :           emit_move_insn (op0, gen_lowpart (mode, op1));
   18337              : 
   18338              :           /* Load even elements into the second position.  */
   18339         6164 :           emit_insn (gen_load_even (op0,
   18340              :                                     force_reg (inner_mode,
   18341         6164 :                                                ops[i + i + 1]),
   18342              :                                     const1_rtx));
   18343              :         }
   18344              : 
   18345              :       /* Cast vector to FIRST_IMODE vector.  */
   18346        17020 :       ops[i] = gen_reg_rtx (first_imode);
   18347        17020 :       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
   18348              :     }
   18349              : 
   18350              :   /* Interleave low FIRST_IMODE vectors.  */
   18351        12391 :   for (i = j = 0; i < n; i += 2, j++)
   18352              :     {
   18353         8510 :       op0 = gen_reg_rtx (first_imode);
   18354         8510 :       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
   18355              : 
   18356              :       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
   18357         8510 :       ops[j] = gen_reg_rtx (second_imode);
   18358         8510 :       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
   18359              :     }
   18360              : 
   18361              :   /* Interleave low SECOND_IMODE vectors.  For V16QImode inputs
                      :      (second_imode == V4SImode) one extra interleave round is needed
                      :      before the final V2DImode merge; the V4SImode case therefore
                      :      falls through into the V2DImode case.  */
   18362         3881 :   switch (second_imode)
   18363              :     {
   18364              :     case E_V4SImode:
   18365         1122 :       for (i = j = 0; i < n / 2; i += 2, j++)
   18366              :         {
   18367          748 :           op0 = gen_reg_rtx (second_imode);
   18368          748 :           emit_insn (gen_interleave_second_low (op0, ops[i],
   18369          748 :                                                 ops[i + 1]));
   18370              : 
   18371              :           /* Cast the SECOND_IMODE vector to the THIRD_IMODE
   18372              :              vector.  */
   18373          748 :           ops[j] = gen_reg_rtx (third_imode);
   18374          748 :           emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
   18375              :         }
   18376              :       second_imode = V2DImode;
   18377              :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18378              :       /* FALLTHRU */
   18379              : 
   18380         3881 :     case E_V2DImode:
   18381         3881 :       op0 = gen_reg_rtx (second_imode);
   18382         3881 :       emit_insn (gen_interleave_second_low (op0, ops[0],
   18383              :                                             ops[1]));
   18384              : 
   18385              :       /* Cast the SECOND_IMODE vector back to a vector on original
   18386              :          mode.  */
   18387         3881 :       emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
   18388         3881 :       break;
   18389              : 
   18390              :     default:
   18391              :       gcc_unreachable ();
   18392              :     }
   18393              : }
   18394              : 
   18395              : /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   18396              :    all values variable, and none identical.  */
   18397              : 
   18398              : static void
   18399       119031 : ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
   18400              :                                  rtx target, rtx vals)
   18401              : {
   18402       119031 :   rtx ops[64], op0, op1, op2, op3, op4, op5;
   18403       119031 :   machine_mode half_mode = VOIDmode;
   18404       119031 :   machine_mode quarter_mode = VOIDmode;
   18405       119031 :   machine_mode int_inner_mode = VOIDmode;
   18406       119031 :   int n, i;
   18407              : 
                      :   /* Dispatch on MODE: wide-element modes go through recursive
                      :      concatenation; 256/512-bit narrow-element modes are split into
                      :      halves or quarters that are built by interleaving; remaining
                      :      small modes break out of the switch and use the word-packing
                      :      fallback at the bottom of the function.  */
   18408       119031 :   switch (mode)
   18409              :     {
   18410        32562 :     case E_V2SFmode:
   18411        32562 :     case E_V2SImode:
   18412        32562 :       if (!mmx_ok && !TARGET_SSE)
   18413              :         break;
   18414              :       /* FALLTHRU */
   18415              : 
   18416       108590 :     case E_V16SImode:
   18417       108590 :     case E_V16SFmode:
   18418       108590 :     case E_V8DFmode:
   18419       108590 :     case E_V8DImode:
   18420       108590 :     case E_V8SFmode:
   18421       108590 :     case E_V8SImode:
   18422       108590 :     case E_V4DFmode:
   18423       108590 :     case E_V4DImode:
   18424       108590 :     case E_V4SFmode:
   18425       108590 :     case E_V4SImode:
   18426       108590 :     case E_V2DFmode:
   18427       108590 :     case E_V2DImode:
   18428       108590 :       n = GET_MODE_NUNITS (mode);
   18429       345934 :       for (i = 0; i < n; i++)
   18430       237344 :         ops[i] = XVECEXP (vals, 0, i);
   18431       108590 :       ix86_expand_vector_init_concat (mode, target, ops, n);
   18432       219285 :       return;
   18433              : 
   18434              :     case E_V2TImode:
                      :       /* Build as a V4DImode concat of the two TImode halves viewed
                      :          as V2DImode, then recast the result.  */
   18435          135 :       for (i = 0; i < 2; i++)
   18436           90 :         ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
   18437           45 :       op0 = gen_reg_rtx (V4DImode);
   18438           45 :       ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
   18439           45 :       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
   18440           45 :       return;
   18441              : 
   18442              :     case E_V4TImode:
                      :       /* Two V4DImode concats of the TImode elements, merged into a
                      :          V8DImode vector and recast to the target mode.  */
   18443          195 :       for (i = 0; i < 4; i++)
   18444          156 :         ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
   18445           39 :       ops[4] = gen_reg_rtx (V4DImode);
   18446           39 :       ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
   18447           39 :       ops[5] = gen_reg_rtx (V4DImode);
   18448           39 :       ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
   18449           39 :       op0 = gen_reg_rtx (V8DImode);
   18450           39 :       ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
   18451           39 :       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
   18452           39 :       return;
   18453              : 
   18454           69 :     case E_V32QImode:
   18455           69 :       half_mode = V16QImode;
   18456           69 :       goto half;
   18457              : 
   18458           64 :     case E_V16HImode:
   18459           64 :       half_mode = V8HImode;
   18460           64 :       goto half;
   18461              : 
   18462          237 :     case E_V16HFmode:
   18463          237 :       half_mode = V8HFmode;
   18464          237 :       goto half;
   18465              : 
   18466           95 :     case E_V16BFmode:
   18467           95 :       half_mode = V8BFmode;
   18468           95 :       goto half;
   18469              : 
   18470          465 : half:
                      :       /* Build each 128-bit half by interleaving, then concatenate
                      :          the halves into the 256-bit TARGET.  */
   18471          465 :       n = GET_MODE_NUNITS (mode);
   18472         9009 :       for (i = 0; i < n; i++)
   18473         8544 :         ops[i] = XVECEXP (vals, 0, i);
   18474          465 :       op0 = gen_reg_rtx (half_mode);
   18475          465 :       op1 = gen_reg_rtx (half_mode);
   18476          465 :       ix86_expand_vector_init_interleave (half_mode, op0, ops,
   18477              :                                           n >> 2);
   18478          465 :       ix86_expand_vector_init_interleave (half_mode, op1,
   18479          465 :                                           &ops [n >> 1], n >> 2);
   18480          465 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
   18481          465 :       return;
   18482              : 
   18483           56 :     case E_V64QImode:
   18484           56 :       quarter_mode = V16QImode;
   18485           56 :       half_mode = V32QImode;
   18486           56 :       goto quarter;
   18487              : 
   18488           71 :     case E_V32HImode:
   18489           71 :       quarter_mode = V8HImode;
   18490           71 :       half_mode = V16HImode;
   18491           71 :       goto quarter;
   18492              : 
   18493          287 :     case E_V32HFmode:
   18494          287 :       quarter_mode = V8HFmode;
   18495          287 :       half_mode = V16HFmode;
   18496          287 :       goto quarter;
   18497              : 
   18498           51 :     case E_V32BFmode:
   18499           51 :       quarter_mode = V8BFmode;
   18500           51 :       half_mode = V16BFmode;
   18501           51 :       goto quarter;
   18502              : 
   18503          465 : quarter:
                      :       /* Build four 128-bit quarters by interleaving, pairwise
                      :          concatenate them into two 256-bit halves, then concatenate
                      :          the halves into the 512-bit TARGET.  */
   18504          465 :       n = GET_MODE_NUNITS (mode);
   18505        17137 :       for (i = 0; i < n; i++)
   18506        16672 :         ops[i] = XVECEXP (vals, 0, i);
   18507          465 :       op0 = gen_reg_rtx (quarter_mode);
   18508          465 :       op1 = gen_reg_rtx (quarter_mode);
   18509          465 :       op2 = gen_reg_rtx (quarter_mode);
   18510          465 :       op3 = gen_reg_rtx (quarter_mode);
   18511          465 :       op4 = gen_reg_rtx (half_mode);
   18512          465 :       op5 = gen_reg_rtx (half_mode);
   18513          465 :       ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
   18514              :                                           n >> 3);
   18515          465 :       ix86_expand_vector_init_interleave (quarter_mode, op1,
   18516          465 :                                           &ops [n >> 2], n >> 3);
   18517          465 :       ix86_expand_vector_init_interleave (quarter_mode, op2,
   18518          465 :                                           &ops [n >> 1], n >> 3);
   18519          465 :       ix86_expand_vector_init_interleave (quarter_mode, op3,
   18520          465 :                                           &ops [(n >> 1) | (n >> 2)], n >> 3);
   18521          465 :       emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
   18522          465 :       emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
   18523          465 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
   18524          465 :       return;
   18525              : 
   18526          326 :     case E_V16QImode:
   18527          326 :       if (!TARGET_SSE4_1)
   18528              :         break;
   18529              :       /* FALLTHRU */
   18530              : 
   18531          517 :     case E_V8HImode:
   18532          517 :       if (!TARGET_SSE2)
   18533              :         break;
   18534              : 
   18535              :       /* Don't use ix86_expand_vector_init_interleave if we can't
   18536              :          move from GPR to SSE register directly.  */
   18537          517 :       if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
   18538              :         break;
   18539              :       /* FALLTHRU */
   18540              : 
   18541         1091 :     case E_V8HFmode:
   18542         1091 :     case E_V8BFmode:
   18543              : 
   18544         1091 :       n = GET_MODE_NUNITS (mode);
   18545         9915 :       for (i = 0; i < n; i++)
   18546         8824 :         ops[i] = XVECEXP (vals, 0, i);
   18547         1091 :       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
   18548         1091 :       return;
   18549              : 
   18550              :     case E_V4HFmode:
   18551              :     case E_V4BFmode:
   18552              :     case E_V2HFmode:
   18553              :     case E_V2BFmode:
                      :       /* HF/BF elements are routed through HImode copies in the
                      :          fallback below; remember that with INT_INNER_MODE.  */
   18554         8336 :       int_inner_mode = HImode;
   18555              :       break;
   18556              : 
   18557              :     case E_V4HImode:
   18558              :     case E_V8QImode:
   18559              : 
   18560              :     case E_V2HImode:
   18561              :     case E_V4QImode:
   18562              :       break;
   18563              : 
   18564            0 :     default:
   18565            0 :       gcc_unreachable ();
   18566              :     }
   18567              : 
                      :   /* Fallback: pack N_ELT_PER_WORD elements into each word-sized
                      :      integer chunk with shift/ior, then move the word(s) into the
                      :      vector register (recursing on a V2xI/V4SI vector of words when
                      :      more than one word is needed).  */
   18568         8336 :     {
   18569         8336 :       int i, j, n_elts, n_words, n_elt_per_word;
   18570         8336 :       machine_mode tmp_mode, inner_mode;
   18571         8336 :       rtx words[4], shift;
   18572              : 
   18573        16749 :       tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
   18574              : 
   18575         8336 :       inner_mode = GET_MODE_INNER (mode);
   18576         8336 :       n_elts = GET_MODE_NUNITS (mode);
   18577        16672 :       n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
   18578         8336 :       n_elt_per_word = n_elts / n_words;
   18579         8336 :       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
   18580              : 
   18581        17064 :       for (i = 0; i < n_words; ++i)
   18582              :         {
   18583              :           rtx word = NULL_RTX;
   18584              : 
                      :           /* Elements are consumed highest-index first so the first
                      :              element ends up in the low bits after the shifts.  */
   18585        46292 :           for (j = 0; j < n_elt_per_word; ++j)
   18586              :             {
   18587        37564 :               rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
   18588        37564 :               if (int_inner_mode != E_VOIDmode)
   18589              :                 {
                      :                   /* Reinterpret an HF/BF element as an HImode value
                      :                      so it can go through integer conversion below.  */
   18590          138 :                   gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
   18591          138 :                   rtx tmp = gen_reg_rtx (int_inner_mode);
   18592          138 :                   elt = lowpart_subreg (int_inner_mode,
   18593              :                                         force_reg (inner_mode, elt),
   18594              :                                         inner_mode);
   18595          138 :                   emit_move_insn (tmp, elt);
   18596          138 :                   elt = tmp;
   18597              :                 }
   18598        37564 :               elt = convert_modes (tmp_mode, inner_mode, elt, true);
   18599              : 
   18600        37564 :               if (j == 0)
   18601              :                 word = elt;
   18602              :               else
   18603              :                 {
   18604        28836 :                   word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
   18605              :                                               NULL_RTX, 1, OPTAB_LIB_WIDEN);
   18606        28836 :                   word = expand_simple_binop (tmp_mode, IOR, word, elt,
   18607              :                                               NULL_RTX, 1, OPTAB_LIB_WIDEN);
   18608              :                 }
   18609              :             }
   18610              : 
   18611         8728 :           words[i] = word;
   18612              :         }
   18613              : 
   18614         8336 :       if (n_words == 1)
   18615         7944 :         emit_move_insn (target, gen_lowpart (mode, words[0]));
   18616          392 :       else if (n_words == 2)
   18617              :         {
   18618          392 :           gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
   18619          392 :           machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
   18620          392 :           rtx tmp = gen_reg_rtx (concat_mode);
   18621          392 :           vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
   18622          392 :           ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
   18623          392 :           emit_move_insn (target, gen_lowpart (mode, tmp));
   18624              :         }
   18625            0 :       else if (n_words == 4)
   18626              :         {
   18627            0 :           rtx tmp = gen_reg_rtx (V4SImode);
   18628            0 :           gcc_assert (tmp_mode == SImode);
   18629            0 :           vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
   18630            0 :           ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
   18631            0 :           emit_move_insn (target, gen_lowpart (mode, tmp));
   18632              :         }
   18633              :       else
   18634            0 :         gcc_unreachable ();
   18635              :     }
   18636              : }
   18637              : 
   18638              : /* Initialize vector TARGET via VALS.  Suppress the use of MMX
   18639              :    instructions unless MMX_OK is true.  */
   18640              : 
   18641              : void
   18642       130291 : ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
   18643              : {
   18644       130291 :   machine_mode mode = GET_MODE (target);
   18645       130291 :   machine_mode inner_mode = GET_MODE_INNER (mode);
   18646       130291 :   int n_elts = GET_MODE_NUNITS (mode);
   18647       130291 :   int n_var = 0, one_var = -1;
   18648       130291 :   bool all_same = true, all_const_zero = true;
   18649       130291 :   int i;
   18650       130291 :   rtx x;
   18651              : 
   18652              :   /* Handle first initialization from vector elts.  */
   18653       130291 :   if (n_elts != XVECLEN (vals, 0))
   18654              :     {
                      :       /* VALS has fewer entries than MODE has elements: the only
                      :          supported shape is two half-width vectors, handled by a
                      :          single concat.  */
   18655         1305 :       rtx subtarget = target;
   18656         1305 :       x = XVECEXP (vals, 0, 0);
   18657         2610 :       gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
   18658         2610 :       if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
   18659              :         {
   18660         1305 :           rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
                      :           /* For QI/HI/TI/HF/BF elements, recast the two halves as
                      :              SImode- or DImode-element vectors of the same total size
                      :              so the concat below operates on supported modes, then
                      :              copy the result back to TARGET's mode.  */
   18661         1305 :           if (inner_mode == QImode
   18662         1305 :               || inner_mode == HImode
   18663         1305 :               || inner_mode == TImode
   18664              :               || inner_mode == HFmode
   18665              :               || inner_mode == BFmode)
   18666              :             {
   18667          148 :               unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
   18668          148 :               scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
   18669          148 :               n_bits /= GET_MODE_SIZE (elt_mode);
   18670          148 :               mode = mode_for_vector (elt_mode, n_bits).require ();
   18671          148 :               inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
   18672          148 :               ops[0] = gen_lowpart (inner_mode, ops[0]);
   18673          148 :               ops[1] = gen_lowpart (inner_mode, ops[1]);
   18674          148 :               subtarget = gen_reg_rtx (mode);
   18675              :             }
   18676         1305 :           ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
   18677         1305 :           if (subtarget != target)
   18678          148 :             emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
   18679         1305 :           return;
   18680              :         }
   18681            0 :       gcc_unreachable ();
   18682              :     }
   18683              : 
                      :   /* Classify the elements: count the non-constant ones (and record
                      :      the index of the last one), and track whether all elements are
                      :      constant zero and whether they are all identical.  */
   18684       475118 :   for (i = 0; i < n_elts; ++i)
   18685              :     {
   18686       346132 :       x = XVECEXP (vals, 0, i);
   18687       672018 :       if (!(CONST_SCALAR_INT_P (x)
   18688       329863 :             || CONST_DOUBLE_P (x)
   18689              :             || CONST_FIXED_P (x)))
   18690       325886 :         n_var++, one_var = i;
   18691        20246 :       else if (x != CONST0_RTX (inner_mode))
   18692         3257 :         all_const_zero = false;
   18693       346132 :       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
   18694              :         all_same = false;
   18695              :     }
   18696              : 
   18697              :   /* Handle the zero vector as special case.  */
   18698       128986 :   if (n_var == 0 && all_const_zero)
   18699              :     {
   18700          302 :       emit_move_insn (target, CONST0_RTX (mode));
   18701          302 :       return;
   18702              :     }
   18703              : 
   18704              :   /* If all values are identical, broadcast the value.  */
   18705       128684 :   if (all_same
   18706       135960 :       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
   18707         7276 :                                             XVECEXP (vals, 0, 0)))
   18708              :     return;
   18709              : 
   18710              :   /* Constants are best loaded from the constant pool.  */
   18711       122598 :   if (n_var == 0)
   18712              :     {
   18713           41 :       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
   18714           41 :       return;
   18715              :     }
   18716              : 
   18717              :   /* Values where only one field is non-constant are best loaded from
   18718              :      the pool and overwritten via move later.  */
   18719       122557 :   if (n_var == 1)
   18720              :     {
   18721        11430 :       if (all_const_zero
   18722        21704 :           && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
   18723        10274 :                                                   XVECEXP (vals, 0, one_var),
   18724              :                                                   one_var))
   18725              :         return;
   18726              : 
   18727         7750 :       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
   18728              :         return;
   18729              :     }
   18730              : 
                      :   /* No special case matched; fall back to the general expander.  */
   18731       118639 :   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
   18732              : }
   18733              : 
/* Implemented as
   V setg (V v, int idx, T val)
   {
     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
     V valv = (V){val, val, val, val, val, val, val, val};
     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
     v = (v & ~mask) | (valv & mask);
     return v;
   }.  */

/* Expand a vector element store where the element index IDX is not a
   compile-time constant.  TARGET is both the input vector and the
   destination; VAL is the scalar to insert.  The value is blended in
   under a mask built by comparing a broadcast of IDX against the
   constant vector {0, 1, ..., n_elts-1}.  */
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  /* Scratch operand array; also reused below as the 6-operand vcond
     descriptor.  64 covers the widest mode handled here (V64QImode).  */
  rtx vec[64];
  machine_mode mode = GET_MODE (target);
  /* Mode in which the index equality comparison is done; for float
     element modes this is switched to the same-sized integer mode.  */
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv,idxv,constv,idx_tmp;
  bool ok = false;

  /* 512-bits vector byte/word broadcast and comparison only available
     under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
     when without TARGET_AVX512BW.  */
  if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
       || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
        {
          half_mode = V16HImode;
          extract_hi = gen_vec_extract_hi_v32hi;
          extract_lo = gen_vec_extract_lo_v32hi;
        }
      else if (mode == V32HFmode)
        {
          half_mode = V16HFmode;
          extract_hi = gen_vec_extract_hi_v32hf;
          extract_lo = gen_vec_extract_lo_v32hf;
        }
      else if (mode == V32BFmode)
        {
          half_mode = V16BFmode;
          extract_hi = gen_vec_extract_hi_v32bf;
          extract_lo = gen_vec_extract_lo_v32bf;
        }
      else
        {
          half_mode = V32QImode;
          extract_hi = gen_vec_extract_hi_v64qi;
          extract_lo = gen_vec_extract_lo_v64qi;
        }

      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      /* idx_hi = idx - n_elts/2, the element index relative to the high
         half.  */
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      /* Recurse into both halves unconditionally: in the half whose
         (rebased) index is out of range the equality mask below matches
         no element, so that half is left unchanged.  */
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  /* For float element modes, do the index comparison in the integer
     vector mode of the same size/layout.  */
  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
        {
        case E_V2DFmode:
          cmp_mode = V2DImode;
          break;
        case E_V4DFmode:
          cmp_mode = V4DImode;
          break;
        case E_V8DFmode:
          cmp_mode = V8DImode;
          break;
        case E_V2SFmode:
          cmp_mode = V2SImode;
          break;
        case E_V4SFmode:
          cmp_mode = V4SImode;
          break;
        case E_V8SFmode:
          cmp_mode = V8SImode;
          break;
        case E_V16SFmode:
          cmp_mode = V16SImode;
          break;
        case E_V2HFmode:
        case E_V2BFmode:
          cmp_mode = V2HImode;
          break;
        case E_V4HFmode:
        case E_V4BFmode:
          cmp_mode = V4HImode;
          break;
        case E_V8HFmode:
          cmp_mode = V8HImode;
          break;
        case E_V16HFmode:
          cmp_mode = V16HImode;
          break;
        case E_V32HFmode:
          cmp_mode = V32HImode;
          break;
        case E_V8BFmode:
          cmp_mode = V8HImode;
          break;
        case E_V16BFmode:
          cmp_mode = V16HImode;
          break;
        case E_V32BFmode:
          cmp_mode = V32HImode;
          break;
        default:
          gcc_unreachable ();
        }
    }

  /* Build the constant index vector {0, 1, ..., n_elts-1}.  */
  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  /* Zero-extend IDX to the comparison element mode.  */
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  /* Broadcast VAL and IDX across their vectors.  */
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
                                          mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
                                          cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  /* target = (idxv == constv) ? valv : target, via ix86_expand_int_vcond's
     operand layout: dest, then-value, else-value, comparison, cmp op0,
     cmp op1.  */
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
   18884              : 
/* Store scalar VAL into element ELT (a compile-time constant) of vector
   TARGET.  MMX_OK says whether MMX-register sequences are acceptable
   for the 64-bit vector modes.  Depending on the mode and enabled ISAs
   this expands to a vec_merge insert, an AVX-512 blendm, extraction and
   reinsertion of a 128-bit half/quarter, SSE shuffles, or as a last
   resort a round trip through a stack temporary.  */
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  /* When true, the blendm expander takes its mask as a constant rather
     than a register (the AVX2 pblend patterns).  */
  bool blendm_const = false;
  rtx tmp;
  /* Half-vector extract patterns for the 256-bit modes, indexed by
     [j][i]: j selects the mode (set at the matching case below), i
     selects the low (0) or high (1) half.  */
  static rtx (*gen_extract[8][2]) (rtx, rtx)
    = {
        { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
        { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
        { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
        { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
        { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
        { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
        { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
        { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
      };
  /* Matching half-vector insert patterns, same indexing as above.  */
  static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
    = {
        { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
        { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
        { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
        { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
        { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
        { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
        { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
        { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
      };
  int i, j, n;
  /* Mask mode for the AVX-512 blendm path; VOIDmode means "not using
     blendm".  */
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
        break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
        {
          /* Extract the other element and rebuild the pair with
             VEC_CONCAT.  */
          tmp = gen_reg_rtx (GET_MODE_INNER (mode));
          ix86_expand_vector_extract (true, tmp, target, 1 - elt);
          if (elt == 0)
            tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
          else
            tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
          emit_insn (gen_rtx_SET (target, tmp));
          return;
        }
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
        break;

      /* Without SSE4.1 pinsrq, extract the other element and rebuild
         the pair with VEC_CONCAT.  */
      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
        tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
        tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
         preserve the rest of the vector for combiner:

         (vec_merge:V2DF
           (vec_duplicate:V2DF (reg:DF))
           (reg:V2DF)
           (const_int 1))
       */
      if (elt == 0)
        goto do_vec_merge;

      {
        rtx op0, op1;

        /* For the two element vectors, we implement a VEC_CONCAT with
           the extraction of the other element.  */

        tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
        tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

        if (elt == 0)
          op0 = val, op1 = tmp;
        else
          op0 = tmp, op1 = val;

        tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
        emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
        break;

      /* No insertps: synthesize the insert with movss (ELT == 0) or
         unpack/shufps sequences.  */
      switch (elt)
        {
        case 0:
          use_vec_merge = true;
          break;

        case 1:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* target = A A B B */
          emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
          /* target = X A B B */
          ix86_expand_vector_set (false, target, val, 0);
          /* target = A X C D  */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const1_rtx, const0_rtx,
                                          GEN_INT (2+4), GEN_INT (3+4)));
          return;

        case 2:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* tmp = X B C D */
          ix86_expand_vector_set (false, tmp, val, 0);
          /* target = A B X D */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const0_rtx, const1_rtx,
                                          GEN_INT (0+4), GEN_INT (3+4)));
          return;

        case 3:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* tmp = X B C D */
          ix86_expand_vector_set (false, tmp, val, 0);
          /* target = A B X D */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const0_rtx, const1_rtx,
                                          GEN_INT (2+4), GEN_INT (0+4)));
          return;

        default:
          gcc_unreachable ();
        }
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
        break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
        {
          use_vec_merge = true;
          break;
        }

      if (TARGET_SSE2)
        {
          /* With SSE2, use integer shuffles to swap element 0 and ELT,
             store into element 0, then shuffle them back.  */

          rtx order[4];

          order[0] = GEN_INT (elt);
          order[1] = const1_rtx;
          order[2] = const2_rtx;
          order[3] = GEN_INT (3);
          order[elt] = const0_rtx;

          emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
                                        order[1], order[2], order[3]));

          ix86_expand_vector_set (false, target, val, 0);

          emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
                                        order[1], order[2], order[3]));
        }
      else
        {
          /* For SSE1, we have to reuse the V4SF code.  */
          rtx t = gen_reg_rtx (V4SFmode);
          emit_move_insn (t, gen_lowpart (V4SFmode, target));
          ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
          emit_move_insn (target, gen_lowpart (mode, t));
        }
      return;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
    case E_V4QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    /* The 256-bit modes: extract the 128-bit half containing ELT,
       insert into it, and put the half back (the "half" label).  */
    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HFmode:
    case E_V16BFmode:
      /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
      if (TARGET_AVX2 && elt != 0)
        {
          mmode = SImode;
          gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
                                                : gen_avx2_pblendbf_1);
          blendm_const = true;
          break;
        }
      else
        {
          half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
          j = ((mode == E_V16HFmode) ? 6 : 7);
          n = 8;
          goto half;
        }

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    /* The 512-bit modes: prefer a masked blend (blendm); for the
       byte/word modes without AVX-512BW, fall back to replacing the
       128-bit quarter containing ELT (the "quarter" label).  */
    case E_V8DFmode:
      if (TARGET_AVX512F)
        {
          mmode = QImode;
          gen_blendm = gen_avx512f_blendmv8df;
        }
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
        {
          mmode = QImode;
          gen_blendm = gen_avx512f_blendmv8di;
        }
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
        {
          mmode = HImode;
          gen_blendm = gen_avx512f_blendmv16sf;
        }
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
        {
          mmode = HImode;
          gen_blendm = gen_avx512f_blendmv16si;
        }
      break;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32hf;
        }
      break;
    case E_V32BFmode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32bf;
        }
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32hi;
        }
      else if (TARGET_AVX512F)
        {
          half_mode = E_V8HImode;
          n = 8;
          goto quarter;
        }
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
        {
          mmode = DImode;
          gen_blendm = gen_avx512bw_blendmv64qi;
        }
      else if (TARGET_AVX512F)
        {
          half_mode = E_V16QImode;
          n = 16;
          goto quarter;
        }
      break;

quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
        /* Extract the quarter.  */
        tmp = gen_reg_rtx (V4SImode);
        rtx tmp2 = gen_lowpart (V16SImode, target);
        rtx mask = gen_reg_rtx (QImode);

        /* All-ones mask: the masked extract/insert patterns are used
           here with no lanes actually masked off.  */
        emit_move_insn (mask, constm1_rtx);
        emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
                                                   tmp, mask));

        tmp2 = gen_reg_rtx (half_mode);
        emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
        tmp = tmp2;

        /* Put val in tmp at elt.  */
        ix86_expand_vector_set (false, tmp, val, elt);

        /* Put it back.  */
        tmp2 = gen_reg_rtx (V16SImode);
        rtx tmp3 = gen_lowpart (V16SImode, target);
        mask = gen_reg_rtx (HImode);
        emit_move_insn (mask, constm1_rtx);
        tmp = gen_lowpart (V4SImode, tmp);
        emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
                                                  tmp3, mask));
        emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      /* AVX-512 blendm path: broadcast VAL, then blend it into TARGET
         under the single-bit mask 1 << ELT.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
      /* The avx512*_blendm<mode> expanders have different operand order
         from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
         elements where the mask is set and second input operand otherwise,
         in {sse,avx}*_*blend* the first input operand is used for elements
         where the mask is clear and second input operand otherwise.  */
      if (!blendm_const)
        merge_mask = force_reg (mmode, merge_mask);
      emit_insn (gen_blendm (target, target, tmp, merge_mask));
    }
  else if (use_vec_merge)
    {
do_vec_merge:
      if (!nonimmediate_operand (val, inner_mode))
        val = force_reg (inner_mode, val);
      /* (vec_merge (vec_duplicate val) target (1 << elt)).  */
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
                               GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      /* Last resort: spill the vector to a stack temporary, store the
         scalar into the right slot, and reload the vector.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
   19327              : 
/* Extract element ELT of vector VEC into scalar TARGET.  MMX_OK says
   whether plain MMX operations may be used (when false, the V2SF and
   V4HI-family cases fall through to the stack-temporary fallback).

   Strategy, chosen per vector mode:
     - 256-bit and 512-bit modes: extract the high or low half into a
       fresh register and recurse on the half-width vector.
     - Modes with a direct element-extract insn (use_vec_extr): emit a
       VEC_SELECT of the single element.
     - Pre-SSE4.1 V4SF/V4SI: first shuffle the wanted element into
       lane 0, then VEC_SELECT lane 0.
     - Anything else: spill VEC to a stack slot and load the element
       from memory.  */
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      /* No extractps: use shufps/unpckhps to move element ELT into
	 lane 0, then extract lane 0 below.  */
      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
				       GEN_INT (elt), GEN_INT (elt),
				       GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  /* unpckhps puts element 2 into lane 0.  */
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  /* No pextrd: shuffle element ELT into lane 0 first.  */
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      /* Without pextrb, element 0 can still be read cheaply via a
	 V4SI extract into a GPR and a lowpart move, when vector->GPR
	 moves are acceptable or we optimize for size.  */
      if (!use_vec_extr
	  && TARGET_SSE2
	  && elt == 0
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	{
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      0);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;
    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    /* 256-bit modes: extract the containing 128-bit half and recurse
       with the element index reduced modulo the half's length.  */
    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    /* 512-bit modes: extract the containing 256-bit half and recurse.  */
    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	{
	  tmp = (mode == E_V32HFmode
		 ? gen_reg_rtx (V16HFmode)
		 : gen_reg_rtx (V16BFmode));
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo (mode, tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi (mode, tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HFmode:
    case E_V16BFmode:
      if (TARGET_AVX)
	{
	  tmp = (mode == E_V16HFmode
		 ? gen_reg_rtx (V8HFmode)
		 : gen_reg_rtx (V8BFmode));
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo (mode, tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi (mode, tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  rtx reg = gen_reg_rtx (SImode);
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  emit_move_insn (reg, tmp);
	  /* Mark the narrow lowpart as a promoted subreg so later uses
	     can reuse the already zero-extended SImode value.  */
	  tmp = gen_lowpart (inner_mode, reg);
	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
	  SUBREG_PROMOTED_SET (tmp, 1);
	}

      emit_move_insn (target, tmp);
    }
  else
    {
      /* Fallback: spill the vector to the stack and load the element.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
   19671              : 
   19672              : /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   19673              :    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   19674              :    The upper bits of DEST are undefined, though they shouldn't cause
   19675              :    exceptions (some bits from src or all zeros are ok).  */
   19676              : 
static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  /* I is the number of bits still being reduced; moving bits
     i/2 .. i-1 of SRC down to bit 0 of D lets the caller combine the
     two halves with one binary op.  D may be retargeted to a scratch
     register in a different mode; the tail then copies it to DEST.  */
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	/* movhlps copies the high 64 bits into the low half.  */
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V4QImode:
      /* View as one SImode lane and shift the upper half down.  */
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V8QImode:
    case E_V4HImode:
      /* View as one DImode lane and shift the upper half down.  */
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      /* Some targets prefer pshufd/pshuflw over a full-register shift
	 for the reduction steps (tuning flag).  */
      if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
	{
	  if (i == 128)
	    {
	      d = gen_reg_rtx (V4SImode);
	      tem = gen_sse2_pshufd_1 (
		  d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
		  GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
	      break;
	    }
	  else if (i == 64)
	    {
	      d = gen_reg_rtx (V4SImode);
	      tem = gen_sse2_pshufd_1 (
		  d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
		  GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
	      break;
	    }
	  else if (i == 32)
	    {
	      d = gen_reg_rtx (V8HImode);
	      tem = gen_sse2_pshuflw_1 (
		  d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
		  GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
	      break;
	    }
	}
      /* Default: 128-bit logical right shift by i/2 bits.  */
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      if (i == 256)
	/* Swap the two 128-bit lanes.  */
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  /* Bring the high 128-bit lane down via a V4DI permute.  */
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  /* Within-lane step: per-lane 128-bit shift right.  */
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      if (i < 64)
	{
	  /* Sub-64-bit steps use a per-lane 128-bit shift.  */
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	/* Move the relevant upper 128-bit block(s) down; the indices
	   past 0xF select from the (identical) second source, leaving
	   the upper result elements as don't-cares.  */
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF),
				    GEN_INT (0xF),
				    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  /* If we worked in a scratch of a different mode, copy back into
     DEST via a paradoxical-free lowpart move.  */
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
   19829              : 
   19830              : /* Expand a vector reduction.  FN is the binary pattern to reduce;
   19831              :    DEST is the destination; IN is the input vector.  */
   19832              : 
void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  /* FN is the gen function for the binary combining insn (e.g.
     gen_uminv8hi3); DEST receives the reduced vector (result in
     element 0 plus don't-care upper elements); IN is the input.  */
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  /* Log-time reduction: at each step move the upper half of the
     remaining I bits down (emit_reduc_half) and combine it with the
     lower half using FN, halving I until one element remains.  */
  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      /* The last combine (two elements left) writes DEST directly;
	 earlier steps go through fresh pseudos.  */
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
   19863              : 
   19864              : /* Output code to perform a conditional jump to LABEL, if C2 flag in
   19865              :    FP status register is set.  */
   19866              : 
void
ix86_emit_fp_unordered_jump (rtx label)
{
  /* Emit: fetch the x87 status word with fnstsw, test the C2
     condition bit, and conditionally jump to LABEL when it is set.
     The branch is predicted not-taken (10%).  */
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  /* reg = x87 FP status word.  */
  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      /* sahf loads the status-word high byte into EFLAGS; the C2 bit
	 then shows up via the UNORDERED condition on CCmode flags.  */
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      /* Otherwise test bit 0x04 of the status-word high byte (the C2
	 flag) directly with a test insn.  */
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  /* Conditional jump to LABEL on the condition built above.  */
  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			      gen_rtx_LABEL_REF (VOIDmode, label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
   19898              : 
/* Output code to perform a sinh XFmode calculation.  */
   19900              : 
void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  /* Compute OP0 = sinh (OP1) in XFmode on the i387.

     With e = expm1 (|x|) = exp (|x|) - 1:
	e / (e + 1) + e = (1 - exp (-|x|)) + (exp (|x|) - 1)
			= exp (|x|) - exp (-|x|),
     so sinh (|x|) = 0.5 * (e / (e + 1) + e); the sign of OP1 (read
     from fxam) is reapplied by negating the sum before the final
     multiply by 0.5.  */
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
   19947              : 
   19948              : /* Output code to perform an cosh XFmode calculation.  */
   19949              : 
   19950              : void
   19951            3 : ix86_emit_i387_cosh (rtx op0, rtx op1)
   19952              : {
   19953            3 :   rtx e1 = gen_reg_rtx (XFmode);
   19954            3 :   rtx e2 = gen_reg_rtx (XFmode);
   19955            3 :   rtx half = const_double_from_real_value (dconsthalf, XFmode);
   19956            3 :   rtx cst1;
   19957              : 
   19958              :   /* e1 = exp (op1) */
   19959            3 :   emit_insn (gen_expxf2 (e1, op1));
   19960              : 
   19961              :   /* e2 = e1 + 1.0 / e1 */
   19962            3 :   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
   19963            3 :   emit_insn (gen_divxf3 (e2, cst1, e1));
   19964            3 :   emit_insn (gen_addxf3 (e2, e1, e2));
   19965              : 
   19966              :   /* op0 = 0.5 * e2 */
   19967            3 :   half = force_reg (XFmode, half);
   19968            3 :   emit_insn (gen_mulxf3 (op0, e2, half));
   19969            3 : }
   19970              : 
/* Output code to perform a tanh XFmode calculation.  OP0 is the
   destination, OP1 the input.  Computes e = expm1 (-|2*op1|), then
   tanh(-|op1|) = e / (e + 2.0); the result is negated afterwards
   when op1 is non-negative.  */

void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1); examines op1 before it is used below.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1); bit 0x02 of the stored fxam status word
     is the sign of the operand.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2; branch around the negation when the
     sign bit is set (op1 < 0), since e2 already equals tanh(op1)
     in that case.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_NE (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
   20017              : 
/* Output code to perform an asinh XFmode calculation.  OP0 is the
   destination, OP1 the input.  Uses the identity
     asinh(x) = sign(x) * log1p (x*x / (sqrt(x*x + 1.0) + 1.0) + |x|).  */

void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1); bit 0x02 of the stored fxam status word
     is the sign of the operand.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2; branch around the negation when the
     sign bit is clear (op1 >= 0).  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
   20070              : 
   20071              : /* Output code to perform an acosh XFmode calculation.  */
   20072              : 
   20073              : void
   20074            0 : ix86_emit_i387_acosh (rtx op0, rtx op1)
   20075              : {
   20076            0 :   rtx e1 = gen_reg_rtx (XFmode);
   20077            0 :   rtx e2 = gen_reg_rtx (XFmode);
   20078            0 :   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
   20079              : 
   20080              :   /* e2 = sqrt (op1 + 1.0) */
   20081            0 :   emit_insn (gen_addxf3 (e2, op1, cst1));
   20082            0 :   emit_insn (gen_sqrtxf2 (e2, e2));
   20083              : 
   20084              :   /* e1 = sqrt (op1 - 1.0) */
   20085            0 :   emit_insn (gen_subxf3 (e1, op1, cst1));
   20086            0 :   emit_insn (gen_sqrtxf2 (e1, e1));
   20087              : 
   20088              :   /* e1 = e1 * e2 */
   20089            0 :   emit_insn (gen_mulxf3 (e1, e1, e2));
   20090              : 
   20091              :   /* e1 = e1 + op1 */
   20092            0 :   emit_insn (gen_addxf3 (e1, e1, op1));
   20093              : 
   20094              :   /* op0 = log (e1) */
   20095            0 :   emit_insn (gen_logxf2 (op0, e1));
   20096            0 : }
   20097              : 
/* Output code to perform an atanh XFmode calculation.  OP0 is the
   destination, OP1 the input.  With x = |op1|, computes
     e = log1p (-(x + x) / (x + 1.0)) = log ((1 - x) / (1 + x)),
   so that atanh(-x) = 0.5 * e; the result is negated afterwards
   when op1 is non-negative.  */

void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1); examines op1 before it is used below.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1); bit 0x02 of the stored fxam status word
     is the sign of the operand.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2; branch around the negation when the
     sign bit is set (op1 < 0), where 0.5 * e2 is already the
     result.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_NE (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
   20149              : 
/* Output code to perform a log1p XFmode calculation, OP0 = log1p (OP1).

   The x87 fyl2xp1 instruction is only specified for arguments with
   |x| < 1 - sqrt(2)/2 (~0.2928932); above that threshold compute
   fyl2x (op1 + 1.0) instead.  */

void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  /* cst = 1 - sqrt(2)/2, the fyl2xp1 domain threshold.  */
  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  /* if (|op1| >= cst) goto label1 (the fyl2x path); predicted
     not taken.  */
  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  /* res = fyl2xp1 (op1) scaled by ln(2): log1p for small |op1|.  */
  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  /* res = fyl2x (op1 + 1.0) scaled by ln(2) for larger |op1|.  */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
   20195              : 
/* Emit code for round calculation: OP0 = round (OP1), using
   round(a) = sgn(a) * floor(|a| + 0.5).  OP1 is SF/DF/XFmode and is
   widened to XFmode; OP0 may be a narrower float mode (truncated
   after rounding) or an integer mode (via lfloor).  */
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  /* Widen narrower inputs to XFmode, where the computation is done.  */
  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Select floor and negation expanders for the output mode: float
     outputs round in XFmode and truncate afterwards, integer outputs
     use lfloor directly.  */
  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1); examines op1 before it is used below.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
        /* Floor in XFmode, then truncate to the narrower float mode
           without a second rounding (UNSPEC_TRUNC_NOOP).  */
        tmp = gen_reg_rtx (XFmode);

        emit_insn (floor_insn (tmp, e2));
        emit_insn (gen_rtx_SET (res,
                                gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
                                                UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a); bit 0x02 of the stored fxam status word is
     the sign of the operand.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res; branch around the negation when the
     sign bit is clear (op1 >= 0).  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
   20308              : 
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].
   RES = A / B: a hardware reciprocal estimate of B refined by one
   Newton-Raphson step.  MODE is a scalar or vector SFmode variant
   (or V8DFmode for the rcp14 path).  */

void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      /* 512-bit modes have no plain rcp; use the AVX512 rcp14
         estimate instead.  */
      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                  UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                UNSPEC_RCP)));

  unsigned vector_size = GET_MODE_SIZE (mode);

  /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
     N-R step with 2 fma implementation.  */
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    {
      /* e0 = x0 * a  */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
      /* e1 = e0 * b - a  */
      emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
                                               gen_rtx_NEG (mode, a))));
      /* res = - e1 * x0 + e0  */
      emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
                                               gen_rtx_NEG (mode, e1),
                                               x0, e0)));
    }
  else
    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
    {
      /* e0 = x0 * b */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

      /* e1 = x0 + x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

      /* e0 = x0 * e0 */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

      /* x1 = e1 - e0 */
      emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

      /* res = a * x1 */
      emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
    }
}
   20371              : 
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.
   RES = sqrt (A) when !RECIP, RES = rsqrt (A) when RECIP: one N-R
   iteration applied to the hardware rsqrt estimate.  */

void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  /* mthree = -3.0, mhalf = -0.5: the N-R polynomial constants.  */
  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
        unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
                                              unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0):
     zero x0 where a == 0 so the final multiply yields 0.0 rather
     than 0.0 * inf = NaN.  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX(mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
        {
          mask = gen_reg_rtx (HImode);
          /* Imm value 0x4 corresponds to not-equal comparison.  */
          emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
          emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
        }
      else
        {
          /* mask is all-ones where a != 0; AND keeps x0 there and
             zeroes it where a == 0.  */
          mask = gen_reg_rtx (mode);
          emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
          emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
        }
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    /* e2 = e0 * x0 - 3.0, in a single fma.  */
    emit_insn (gen_rtx_SET (e2,
                            gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0 (mthree is -3.0, hence the PLUS).  */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
   20465              : 
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  /* Pick the vector mode used to build the mask constant; scalar
     SF/DF masks are built in the corresponding vector mode and the
     first element extracted below.  */
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;
  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  /* xa = op0 & mask, clearing the sign bit.  */
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
   20499              : 
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
                                  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  /* Wrap unordered comparisons in UNSPEC_NOTRAP so they are expanded
     to the non-trapping compare variant.  */
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  /* Conditional jump to LABEL when CODE holds on the flags.  */
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                              gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
   20529              : 
   20530              : /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   20531              :    using comparison code CODE.  Operands are swapped for the comparison if
   20532              :    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
   20533              : static rtx
   20534          541 : ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
   20535              :                               bool swap_operands)
   20536              : {
   20537          541 :   rtx (*insn)(rtx, rtx, rtx, rtx);
   20538          541 :   machine_mode mode = GET_MODE (op0);
   20539          541 :   rtx mask = gen_reg_rtx (mode);
   20540              : 
   20541          541 :   if (swap_operands)
   20542          362 :     std::swap (op0, op1);
   20543              : 
   20544          541 :   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
   20545              : 
   20546          541 :   emit_insn (insn (mask, op0, op1,
   20547              :                    gen_rtx_fmt_ee (code, mode, op0, op1)));
   20548          541 :   return mask;
   20549              : }
   20550              : 
   20551              : /* Expand copysign from SIGN to the positive value ABS_VALUE
   20552              :    storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   20553              :    the sign-bit.  */
   20554              : 
   20555              : static void
   20556         1016 : ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
   20557              : {
   20558         1016 :   machine_mode mode = GET_MODE (sign);
   20559         1016 :   rtx sgn = gen_reg_rtx (mode);
   20560         1016 :   if (mask == NULL_RTX)
   20561              :     {
   20562           28 :       machine_mode vmode;
   20563              : 
   20564           28 :       if (mode == SFmode)
   20565              :         vmode = V4SFmode;
   20566              :       else if (mode == DFmode)
   20567              :         vmode = V2DFmode;
   20568              :       else if (mode == HFmode)
   20569              :         vmode = V8HFmode;
   20570              :       else
   20571           28 :         vmode = mode;
   20572              : 
   20573           28 :       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
   20574           28 :       if (!VECTOR_MODE_P (mode))
   20575              :         {
   20576              :           /* We need to generate a scalar mode mask in this case.  */
   20577           28 :           rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
   20578           28 :           tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
   20579           28 :           mask = gen_reg_rtx (mode);
   20580           28 :           emit_insn (gen_rtx_SET (mask, tmp));
   20581              :         }
   20582              :     }
   20583              :   else
   20584          988 :     mask = gen_rtx_NOT (mode, mask);
   20585         1016 :   emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
   20586         1016 :   emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
   20587         1016 : }
   20588              : 
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
        return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0): pred_half = 0.5 - 2**(-p-1), the
     largest representable value strictly below 0.5.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj; the truncating float-to-int conversion supplies
     the final round toward zero.  */
  expand_fix (op0, adj, 0);
}
   20619              : 
/* Expand SSE2 sequence for computing lfloor (DO_FLOOR) or lceil
   (!DO_FLOOR) from OP1 storing into OP0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
        xi = (long)op1;
        xi -= (double)xi > op1 ? 1 : 0;
        return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg.  For ceil the comparison
     operands are swapped (!do_floor) and 1 is added instead of
     subtracted; the jump skips the adjustment when no compensation
     is needed.  */
  label = ix86_expand_sse_compare_and_jump (UNLE,
                                            freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
                             ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
   20656              : 
   20657              : /* Generate and return a rtx of mode MODE for 2**n where n is the number
   20658              :    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
   20659              : 
   20660              : static rtx
   20661          996 : ix86_gen_TWO52 (machine_mode mode)
   20662              : {
   20663          996 :   const struct real_format *fmt;
   20664          996 :   REAL_VALUE_TYPE TWO52r;
   20665          996 :   rtx TWO52;
   20666              : 
   20667          996 :   fmt = REAL_MODE_FORMAT (mode);
   20668          996 :   real_2expN (&TWO52r, fmt->p - 1, mode);
   20669          996 :   TWO52 = const_double_from_real_value (TWO52r, mode);
   20670          996 :   TWO52 = force_reg (mode, TWO52);
   20671              : 
   20672          996 :   return TWO52;
   20673              : }
   20674              : 
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        xa = fabs (operand1);
        if (!isless (xa, 2**52))
          return operand1;
        two52 = 2**52;
        if (flag_rounding_math)
          {
            two52 = copysign (two52, operand1);
            xa = operand1;
          }
        xa = xa + two52 - two52;
        return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1); MASK receives the sign-bit mask reused by the
     copysign expansions below.  */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Magnitudes >= 2**(p-1) (and NaNs) are already integral, so the
     input is returned unchanged.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      /* Under -frounding-math give TWO52 the sign of the input so the
         addition below rounds in the correct dynamic direction.  */
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  /* xa = xa + TWO52 - TWO52: the addition pushes the fraction bits out
     of the mantissa, rounding per the current rounding mode; the
     subtraction restores the magnitude.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20729              : 
/* Expand SSE2 sequence for computing floor (DO_FLOOR) or ceil
   (!DO_FLOOR) from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;

     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;

        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Magnitudes >= 2**(p-1) (and NaNs) are already integral.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x: truncate via the int round trip.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0).  The compare mask is
     ANDed with 1.0 to produce the 0.0/1.0 adjustment; for ceil the
     comparison operands are swapped (!do_floor) and the adjustment is
     added instead of subtracted.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
        tmp = ix86_expand_sse_fabs (tmp, NULL);

      /* Restore the input's sign so that e.g. -0.0 is preserved.  */
      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20796              : 
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa = xa + TWO52 - TWO52;
        x2 = copysign (xa, x);

     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;

        if (HONOR_SIGNED_ZEROS (mode))
          x2 = copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; rounds XA to integral without an integer
     round trip (the 32-bit-safe replacement for (double)(long)x).  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0).  The compare mask is
     ANDed with 1.0 to form the adjustment; for ceil the comparison is
     swapped (!do_floor) and the adjustment added instead.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
        tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20867              : 
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1); MASK receives the sign-bit mask.  */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Magnitudes >= 2**(p-1) (and NaNs) are already integral.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x: the float->int->float round trip truncates
     toward zero.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* Restore the input's sign so that trunc (-0.0) stays -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20913              : 
   20914              : /* Expand SSE sequence for computing trunc from OPERAND1 storing
   20915              :    into OPERAND0 without relying on DImode truncation via cvttsd2siq
   20916              :    that is only available on 64bit targets.  */
   20917              : void
   20918            0 : ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
   20919              : {
   20920            0 :   machine_mode mode = GET_MODE (operand0);
   20921            0 :   rtx xa, xa2, TWO52, tmp, one, res, mask;
   20922            0 :   rtx_code_label *label;
   20923              : 
   20924              :   /* C code for SSE variant we expand below.
   20925              :         double xa = fabs (x), x2;
   20926              :         if (!isless (xa, TWO52))
   20927              :           return x;
   20928              :         xa2 = xa + TWO52 - TWO52;
   20929              :      Compensate:
   20930              :         if (xa2 > xa)
   20931              :           xa2 -= 1.0;
   20932              :         x2 = copysign (xa2, x);
   20933              :         return x2;
   20934              :    */
   20935              : 
   20936            0 :   TWO52 = ix86_gen_TWO52 (mode);
   20937              : 
   20938              :   /* Temporary for holding the result, initialized to the input
   20939              :      operand to ease control flow.  */
   20940            0 :   res =copy_to_reg (operand1);
   20941              : 
   20942              :   /* xa = abs (operand1) */
   20943            0 :   xa = ix86_expand_sse_fabs (res, &mask);
   20944              : 
   20945              :   /* if (!isless (xa, TWO52)) goto label; */
   20946            0 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   20947              : 
   20948              :   /* xa2 = xa + TWO52 - TWO52; */
   20949            0 :   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
   20950            0 :   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
   20951              : 
   20952              :   /* generate 1.0 */
   20953            0 :   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
   20954              : 
   20955              :   /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
   20956            0 :   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
   20957            0 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
   20958            0 :   tmp = expand_simple_binop (mode, MINUS,
   20959              :                              xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   20960              :   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
   20961            0 :   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
   20962            0 :     tmp = ix86_expand_sse_fabs (tmp, NULL);
   20963              : 
   20964              :   /* res = copysign (xa2, operand1) */
   20965            0 :   ix86_sse_copysign_to_positive (res, tmp, res, mask);
   20966              : 
   20967            0 :   emit_label (label);
   20968            0 :   LABEL_NUSES (label) = 1;
   20969              : 
   20970            0 :   emit_move_insn (operand0, res);
   20971            0 : }
   20972              : 
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        double xa = fabs (x);
        if (!isless (xa, TWO52))
          return x;
        xa = (double)(long)(xa + nextafter (0.5, 0.0));
        return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  /* Magnitudes >= 2**(p-1) (and NaNs) are already integral; skip.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0): pred_half = 0.5 - 2**(-p-1), the largest
     representable value strictly below 0.5, so that values just below
     0.5 do not get rounded up by the addition below.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa: truncating round trip through the
     integer mode.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   21021              : 
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), xa2, x2;
        if (!isless (xa, TWO52))
          return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
        xa2 = xa + TWO52 - TWO52;
     Compensate.
        dxa = xa2 - xa;
        if (dxa <= -0.5)
          xa2 += 1;
        else if (dxa > 0.5)
          xa2 -= 1;
        x2 = copysign (xa2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; rounds XA to integral per the current
     rounding mode.  */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; the rounding error to compensate for.  */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
                               0, OPTAB_DIRECT);

  /* Compensate.  Each compare mask is ANDed with 1.0 to produce the
     0.0/1.0 adjustment.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0), expressed as -0.5 >= dxa.  */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   21091              : 
   21092              : /* Expand SSE sequence for computing round
   21093              :    from OP1 storing into OP0 using sse4 round insn.  */
   21094              : void
   21095            9 : ix86_expand_round_sse4 (rtx op0, rtx op1)
   21096              : {
   21097            9 :   machine_mode mode = GET_MODE (op0);
   21098            9 :   rtx e1, e2, res, half;
   21099            9 :   const struct real_format *fmt;
   21100            9 :   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
   21101            9 :   rtx (*gen_copysign) (rtx, rtx, rtx);
   21102            9 :   rtx (*gen_round) (rtx, rtx, rtx);
   21103              : 
   21104            9 :   switch (mode)
   21105              :     {
   21106              :     case E_HFmode:
   21107              :       gen_copysign = gen_copysignhf3;
   21108              :       gen_round = gen_sse4_1_roundhf2;
   21109              :       break;
   21110            4 :     case E_SFmode:
   21111            4 :       gen_copysign = gen_copysignsf3;
   21112            4 :       gen_round = gen_sse4_1_roundsf2;
   21113            4 :       break;
   21114            4 :     case E_DFmode:
   21115            4 :       gen_copysign = gen_copysigndf3;
   21116            4 :       gen_round = gen_sse4_1_rounddf2;
   21117            4 :       break;
   21118            0 :     default:
   21119            0 :       gcc_unreachable ();
   21120              :     }
   21121              : 
   21122              :   /* round (a) = trunc (a + copysign (0.5, a)) */
   21123              : 
   21124              :   /* load nextafter (0.5, 0.0) */
   21125            9 :   fmt = REAL_MODE_FORMAT (mode);
   21126            9 :   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
   21127            9 :   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
   21128            9 :   half = const_double_from_real_value (pred_half, mode);
   21129              : 
   21130              :   /* e1 = copysign (0.5, op1) */
   21131            9 :   e1 = gen_reg_rtx (mode);
   21132            9 :   emit_insn (gen_copysign (e1, half, op1));
   21133              : 
   21134              :   /* e2 = op1 + e1 */
   21135            9 :   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
   21136              : 
   21137              :   /* res = trunc (e2) */
   21138            9 :   res = gen_reg_rtx (mode);
   21139            9 :   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
   21140              : 
   21141            9 :   emit_move_insn (op0, res);
   21142            9 : }
   21143              : 
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  Created lazily by init_vselect_insn; users patch it
   in place, run recog on it, and then restore the placeholder operands.
   Rooted for garbage collection via GTY.  */

static GTY(()) rtx_insn *vselect_insn;
   21149              : 
   21150              : /* Initialize vselect_insn.  */
   21151              : 
   21152              : static void
   21153         7514 : init_vselect_insn (void)
   21154              : {
   21155         7514 :   unsigned i;
   21156         7514 :   rtx x;
   21157              : 
   21158         7514 :   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
   21159       488410 :   for (i = 0; i < MAX_VECT_LEN; ++i)
   21160       480896 :     XVECEXP (x, 0, i) = const0_rtx;
   21161         7514 :   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
   21162              :                                                         const0_rtx), x);
   21163         7514 :   x = gen_rtx_SET (const0_rtx, x);
   21164         7514 :   start_sequence ();
   21165         7514 :   vselect_insn = emit_insn (x);
   21166         7514 :   end_sequence ();
   21167         7514 : }
   21168              : 
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.
   PERM holds NELT selector indices.  When TESTING_P, only check
   recognizability; do not emit anything.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
                unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  /* Lazily create the shared scratch insn.  */
  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* Patch the cached insn in place: install the selector indices...  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  /* ...then the real source operand, result mode and destination,
     remembering the placeholder source so it can be restored.  */
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  /* Emit a copy, not the cached insn itself, so the cache survives.  */
  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  /* Restore the scratch insn to its pristine state for the next call.  */
  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
   21202              : 
/* Similar, but generate a vec_select from a vec_concat of op0 and op1,
   i.e. a two-operand permutation, as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
                        const unsigned char *perm, unsigned nelt,
                        bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* The concatenation requires a mode twice as wide as OP0's; give up
     if the target has no such vector mode.  */
  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  /* Temporarily plug the real operands into the cached vec_concat...  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  /* ...and restore the placeholders afterwards.  */
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
   21228              : 
   21229              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21230              :    using movss or movsd.  */
   21231              : static bool
   21232       317242 : expand_vec_perm_movs (struct expand_vec_perm_d *d)
   21233              : {
   21234       317242 :   machine_mode vmode = d->vmode;
   21235       317242 :   unsigned i, nelt = d->nelt;
   21236       317242 :   rtx x;
   21237              : 
   21238       317242 :   if (d->one_operand_p)
   21239              :     return false;
   21240              : 
   21241       290387 :   if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
   21242       140485 :       && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
   21243        85383 :       && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
   21244              :     return false;
   21245              : 
   21246              :   /* Only the first element is changed.  */
   21247       214071 :   if (d->perm[0] != nelt && d->perm[0] != 0)
   21248              :     return false;
   21249       160124 :   for (i = 1; i < nelt; ++i)
   21250       124987 :     if (d->perm[i] != i + nelt - d->perm[0])
   21251              :       return false;
   21252              : 
   21253        35137 :   if (d->testing_p)
   21254              :     return true;
   21255              : 
   21256         6531 :   if (d->perm[0] == nelt)
   21257            0 :     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
   21258              :   else
   21259         6531 :     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
   21260              : 
   21261         6531 :   emit_insn (gen_rtx_SET (d->target, x));
   21262              : 
   21263         6531 :   return true;
   21264              : }
   21265              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using insertps: the permutation must be the identity on one operand
   except for exactly one element, which may come from anywhere.  */
static bool
expand_vec_perm_insertps (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, cnt_s, nelt = d->nelt;
  int cnt_d = -1;
  rtx src, dst;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE4_1
        && (vmode == V4SFmode || vmode == V4SImode
            || (TARGET_MMX_WITH_SSE
                && (vmode == V2SFmode || vmode == V2SImode)))))
    return false;

  /* First see whether the permutation is the identity on op0 except for
     a single element; CNT_D records that element's position.  A second
     mismatch resets CNT_D to -1 so the op1 check below runs.  */
  for (i = 0; i < nelt; ++i)
    {
      if (d->perm[i] == i)
        continue;
      if (cnt_d != -1)
        {
          cnt_d = -1;
          break;
        }
      cnt_d = i;
    }

  /* Otherwise require the identity on op1 except one element; more than
     one mismatch, or no mismatch at all, means insertps can't help.  */
  if (cnt_d == -1)
    {
      for (i = 0; i < nelt; ++i)
        {
          if (d->perm[i] == i + nelt)
            continue;
          if (cnt_d != -1)
            return false;
          cnt_d = i;
        }

      if (cnt_d == -1)
        return false;
    }

  if (d->testing_p)
    return true;

  gcc_assert (cnt_d != -1);

  /* CNT_S is the source element's index; DST is whichever operand
     supplies all the unchanged elements, SRC the other one.  */
  cnt_s = d->perm[cnt_d];
  if (cnt_s < nelt)
    {
      src = d->op0;
      dst = d->op1;
    }
  else
    {
      cnt_s -= nelt;
      src = d->op1;
      dst = d->op0;
     }
  gcc_assert (cnt_s < nelt);

  /* insertps immediate: bits 7:6 select the source element, bits 5:4
     the destination position.  */
  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
                               GEN_INT (cnt_s << 6 | cnt_d << 4));
  emit_insn (x);

  return true;
}
   21337              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  Succeeds
   only when every element stays in its own position, merely choosing
   between op0 and op1 (a blend, not a general permute).  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  /* Check for an ISA that has a blend for this mode/size.  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
          || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
           && (GET_MODE_SIZE (vmode) == 16
               || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
               || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
        return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  /* Build the immediate blend mask, possibly recasting the operation
     into a different (usually wider-element) mode via do_subreg, or
     falling back to a variable-mask pblendvb.  */
  switch (vmode)
    {
    /* Modes whose blend insn takes one mask bit per element; bit I set
       means element I comes from op1.  */
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2SFmode:
    case E_V2HImode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
        mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      if (TARGET_AVX2)
        {
          /* Use vpblendd instead of vpblendw.  */
          for (i = 0; i < nelt; ++i)
            mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
          break;
        }
      else
        {
          for (i = 0; i < 4; ++i)
            mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
          vmode = V8HImode;
          goto do_subreg;
        }

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
         an immediate argument, rather than pblendvb with a vector
         argument.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          {
          use_pblendvb:
            /* Build a per-element 0/-1 mask vector for pblendvb.  */
            for (i = 0; i < nelt; ++i)
              rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

          finish_pblendvb:
            vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
            vperm = force_reg (vmode, vperm);

            if (GET_MODE_SIZE (vmode) == 4)
              emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 8)
              emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 16)
              emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
            else
              emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
            if (target != d->target)
              emit_move_insn (d->target, gen_lowpart (d->vmode, target));
            return true;
          }

      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      /* Perform the blend in VMODE instead; copy back via lowpart at
         the end if a fresh TARGET register was needed.  */
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      for (i = 0; i < 8; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      for (i = 0; i < 4; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
         with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
        if (d->perm[i] + 2 != d->perm[i + 2])
          break;
      if (i < 32)
        {
          /* See if bytes move the same in both lanes.  If yes,
             vpblendw with immediate can be used.  */
          for (i = 0; i < 16; i += 2)
            if (d->perm[i] + 16 != d->perm[i + 16])
              goto use_pblendvb;

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i * 2] >= 32) << i;
          vmode = V16HImode;
          goto do_subreg;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          break;
      if (i < 16)
        {
          /* See if words move the same in both lanes.  If not,
             vpblendvb must be used.  */
          for (i = 0; i < 8; i++)
            if (d->perm[i] + 8 != d->perm[i + 8])
              {
                /* Use vpblendvb.  */
                for (i = 0; i < 32; ++i)
                  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

                vmode = V32QImode;
                nelt = 32;
                target = gen_reg_rtx (vmode);
                op0 = gen_lowpart (vmode, op0);
                op1 = gen_lowpart (vmode, op1);
                goto finish_pblendvb;
              }

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i] >= 16) << i;
          break;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* AVX512-style blends take the mask in a scalar integer mode matching
     the element count; MMODE stays VOIDmode for immediate-mask blends.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  /* Canonicalize vec_merge.  */
  if (swap_commutative_operands_p (op1, op0)
      /* Two operands have same precedence, then
         first bit of mask select first operand.  */
      || (!swap_commutative_operands_p (op0, op1)
          && !(mask & 1)))
    {
      /* Swap the operands and complement the mask (within the valid
         element bits) so the blend result is unchanged.  */
      unsigned n_elts = GET_MODE_NUNITS (vmode);
      std::swap (op0, op1);
      unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
      if (n_elts == HOST_BITS_PER_WIDE_INT)
        mask_all  = -1;
      else
        mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
      mask = ~mask & mask_all;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
   21625              : 
   21626              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21627              :    in terms of the variable form of vpermilps.
   21628              : 
   21629              :    Note that we will have already failed the immediate input vpermilps,
   21630              :    which requires that the high and low part shuffle be identical; the
   21631              :    variable form doesn't require that.  */
   21632              : 
   21633              : static bool
   21634       137501 : expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
   21635              : {
   21636       137501 :   rtx rperm[8], vperm;
   21637       137501 :   unsigned i;
   21638              : 
   21639       137501 :   if (!TARGET_AVX || !d->one_operand_p
   21640        12421 :       || (d->vmode != V8SImode && d->vmode != V8SFmode))
   21641              :     return false;
   21642              : 
   21643              :   /* We can only permute within the 128-bit lane.  */
   21644        20283 :   for (i = 0; i < 8; ++i)
   21645              :     {
   21646        19345 :       unsigned e = d->perm[i];
   21647        19345 :       if (i < 4 ? e >= 4 : e < 4)
   21648              :         return false;
   21649              :     }
   21650              : 
   21651          938 :   if (d->testing_p)
   21652              :     return true;
   21653              : 
   21654          657 :   for (i = 0; i < 8; ++i)
   21655              :     {
   21656          584 :       unsigned e = d->perm[i];
   21657              : 
   21658              :       /* Within each 128-bit lane, the elements of op0 are numbered
   21659              :          from 0 and the elements of op1 are numbered from 4.  */
   21660          584 :       if (e >= 8 + 4)
   21661            0 :         e -= 8;
   21662          584 :       else if (e >= 4)
   21663          292 :         e -= 4;
   21664              : 
   21665          584 :       rperm[i] = GEN_INT (e);
   21666              :     }
   21667              : 
   21668           73 :   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
   21669           73 :   vperm = force_reg (V8SImode, vperm);
   21670           73 :   rtx target = d->target;
   21671           73 :   rtx op0 = d->op0;
   21672           73 :   if (d->vmode == V8SImode)
   21673              :     {
   21674           21 :       target = lowpart_subreg (V8SFmode, target, V8SImode);
   21675           21 :       op0 = lowpart_subreg (V8SFmode, op0, V8SImode);
   21676              :     }
   21677              : 
   21678           73 :   emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm));
   21679              : 
   21680           73 :   return true;
   21681              : }
   21682              : 
   21683              : /* For V*[QHS]Imode permutations, check if the same permutation
   21684              :    can't be performed in a 2x, 4x or 8x wider inner mode.  */
   21685              : 
   21686              : static bool
   21687       160525 : canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
   21688              :                               struct expand_vec_perm_d *nd)
   21689              : {
   21690       160525 :   int i;
   21691       160525 :   machine_mode mode = VOIDmode;
   21692              : 
   21693       160525 :   switch (d->vmode)
   21694              :     {
   21695              :     case E_V8QImode: mode = V4HImode; break;
   21696        29410 :     case E_V16QImode: mode = V8HImode; break;
   21697         1420 :     case E_V32QImode: mode = V16HImode; break;
   21698          315 :     case E_V64QImode: mode = V32HImode; break;
   21699        11623 :     case E_V4HImode: mode = V2SImode; break;
   21700        20438 :     case E_V8HImode: mode = V4SImode; break;
   21701         1001 :     case E_V16HImode: mode = V8SImode; break;
   21702          397 :     case E_V32HImode: mode = V16SImode; break;
   21703        40595 :     case E_V4SImode: mode = V2DImode; break;
   21704         1491 :     case E_V8SImode: mode = V4DImode; break;
   21705           65 :     case E_V16SImode: mode = V8DImode; break;
   21706              :     default: return false;
   21707              :     }
   21708       201913 :   for (i = 0; i < d->nelt; i += 2)
   21709       187615 :     if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
   21710              :       return false;
   21711        14298 :   nd->vmode = mode;
   21712        14298 :   nd->nelt = d->nelt / 2;
   21713        93526 :   for (i = 0; i < nd->nelt; i++)
   21714        79228 :     nd->perm[i] = d->perm[2 * i] / 2;
   21715        28596 :   if (GET_MODE_INNER (mode) != DImode)
   21716        12564 :     canonicalize_vector_int_perm (nd, nd);
   21717        14298 :   if (nd != d)
   21718              :     {
   21719         9053 :       nd->one_operand_p = d->one_operand_p;
   21720         9053 :       nd->testing_p = d->testing_p;
   21721         9053 :       if (d->op0 == d->op1)
   21722         3031 :         nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
   21723              :       else
   21724              :         {
   21725         6022 :           nd->op0 = gen_lowpart (nd->vmode, d->op0);
   21726         6022 :           nd->op1 = gen_lowpart (nd->vmode, d->op1);
   21727              :         }
   21728         9053 :       if (d->testing_p)
   21729         5790 :         nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
   21730              :       else
   21731         3263 :         nd->target = gen_reg_rtx (nd->vmode);
   21732              :     }
   21733              :   return true;
   21734              : }
   21735              : 
   21736              : /* Return true if permutation D can be performed as VMODE permutation
   21737              :    instead.  */
   21738              : 
   21739              : static bool
   21740         7580 : valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
   21741              : {
   21742         7580 :   unsigned int i, j, chunk;
   21743              : 
   21744         7580 :   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
   21745         7580 :       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
   21746        18636 :       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
   21747              :     return false;
   21748              : 
   21749        11056 :   if (GET_MODE_NUNITS (vmode) >= d->nelt)
   21750              :     return true;
   21751              : 
   21752         5236 :   chunk = d->nelt / GET_MODE_NUNITS (vmode);
   21753         7186 :   for (i = 0; i < d->nelt; i += chunk)
   21754         6939 :     if (d->perm[i] & (chunk - 1))
   21755              :       return false;
   21756              :     else
   21757        12694 :       for (j = 1; j < chunk; ++j)
   21758        10744 :         if (d->perm[i] + j != d->perm[i + j])
   21759              :           return false;
   21760              : 
   21761              :   return true;
   21762              : }
   21763              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.
   Returns true (emitting insns unless D->testing_p) on success.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  /* Two-operand permutations: only XOP vpperm (<= 16 bytes) or, for
     32 bytes, AVX2 vperm2i128 when whole 128-bit halves move.  */
  if (!d->one_operand_p)
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_XOP)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_XOP)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_XOP)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* vperm2i128 applies when each half of the result is one whole
	   128-bit half of either input.  */
	if (valid_perm_using_mode_p (V2TImode, d))
	  {
	    if (d->testing_p)
	      return true;

	    /* Use vperm2i128 insn.  The pattern uses
	       V4DImode instead of V2TImode.  */
	    target = d->target;
	    if (d->vmode != V4DImode)
	      target = gen_reg_rtx (V4DImode);
	    op0 = gen_lowpart (V4DImode, d->op0);
	    op1 = gen_lowpart (V4DImode, d->op1);
	    /* Low nibble selects the low result half, bits 4+ the high
	       half; halves >= 2 come from the second operand.  */
	    rperm[0]
	      = GEN_INT ((d->perm[0] / (nelt / 2))
			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }
	/* FALLTHRU */

      default:
	return false;
      }
  /* One-operand permutations: SSSE3 pshufb for narrow vectors, and for
     256/512 bits first try coarser vpermq/vpermd/vpermps forms before
     falling back to intra-lane vpshufb.  */
  else
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* V4DImode should be already handled through
	   expand_vselect by vpermq instruction.  */
	gcc_assert (d->vmode != V4DImode);

	vmode = V32QImode;
	if (d->vmode == V8SImode
	    || d->vmode == V16HImode
	    || d->vmode == V32QImode)
	  {
	    /* First see if vpermq can be used for
	       V8SImode/V16HImode/V32QImode.  */
	    if (valid_perm_using_mode_p (V4DImode, d))
	      {
		/* Collapse the permutation to four qword indices.  */
		for (i = 0; i < 4; i++)
		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V4DImode);
		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				    perm, 4, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V8SImode, d))
	      vmode = V8SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V8SFmode)
	  vmode = V8SImode;

	if (vmode == V32QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (nelt / 2))
		return false;
	  }
	break;

      case 64:
	if (!TARGET_AVX512BW)
	  return false;

	/* If vpermq didn't work, vpshufb won't work either.  */
	if (d->vmode == V8DFmode || d->vmode == V8DImode)
	  return false;

	vmode = V64QImode;
	if (d->vmode == V16SImode
	    || d->vmode == V32HImode
	    || d->vmode == V64QImode)
	  {
	    /* First see if vpermq can be used for
	       V16SImode/V32HImode/V64QImode.  */
	    if (valid_perm_using_mode_p (V8DImode, d))
	      {
		/* Collapse the permutation to eight qword indices.  */
		for (i = 0; i < 8; i++)
		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V8DImode);
		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				    perm, 8, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V16SImode, d))
	      vmode = V16SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V16SFmode)
	  vmode = V16SImode;

	if (vmode == V64QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.
	       The mask 3*nelt/4 covers both lane-selecting bits.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
		return false;
	  }
	break;

      default:
	return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Build the constant selector RPERM.  vpermd/vpermps take per-dword
     indices; the byte-shuffle forms take per-byte indices.  */
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      /* Intra-lane vpshufb: keep only the index bits within one
	 16-byte lane (4 lanes for 512-bit, 2 for 256-bit).  */
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
	mask = nelt / 2 - 1;
      else
	mask = nelt - 1;

      /* Expand each element index into ELTSZ consecutive byte indices.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  machine_mode vpmode = vmode;

  /* From here on NELT counts bytes of VMODE, not elements of D.  */
  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      /* -128 has the high bit set, which makes pshufb zero the
	 corresponding result byte.  */
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
	 account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
	{
	  for (i = 0; i < nelt; ++i)
	    {
	      unsigned ival = UINTVAL (rperm[i]);
	      if (ival >= nelt)
		rperm[i] = GEN_INT (ival + 16 - nelt);
	    }
	}

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
	rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  /* Work in VMODE; move to D->vmode at the end if they differ.  */
  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
	gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
	gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
	gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
	gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
	gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
	gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
	gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
	gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
	gen = gen_avx512f_permvarv16si;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      /* Two-operand case: XOP vpperm only (guaranteed by the first
	 switch above).  */
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
	gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
	gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
	gen = gen_xop_pperm;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
   22078              : 
   22079              : /* Try to expand one-operand permutation with constant mask.  */
   22080              : 
   22081              : static bool
   22082       124001 : ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
   22083              : {
   22084       124001 :   machine_mode mode = GET_MODE (d->op0);
   22085       124001 :   machine_mode maskmode = mode;
   22086       248002 :   unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
   22087       124001 :   rtx (*gen) (rtx, rtx, rtx) = NULL;
   22088       124001 :   rtx target, op0, mask;
   22089       124001 :   rtx vec[64];
   22090              : 
   22091       124001 :   if (!rtx_equal_p (d->op0, d->op1))
   22092              :     return false;
   22093              : 
   22094        17766 :   if (!TARGET_AVX512F)
   22095              :     return false;
   22096              : 
   22097              :   /* Accept VNxHImode and VNxQImode now.  */
   22098          719 :   if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
   22099              :     return false;
   22100              : 
   22101              :   /* vpermw.  */
   22102          457 :   if (!TARGET_AVX512BW && inner_size == 2)
   22103              :     return false;
   22104              : 
   22105              :   /* vpermb.  */
   22106          323 :   if (!TARGET_AVX512VBMI && inner_size == 1)
   22107              :     return false;
   22108              : 
   22109          202 :   switch (mode)
   22110              :     {
   22111              :     case E_V16SImode:
   22112              :       gen = gen_avx512f_permvarv16si;
   22113              :       break;
   22114            4 :     case E_V16SFmode:
   22115            4 :       gen = gen_avx512f_permvarv16sf;
   22116            4 :       maskmode = V16SImode;
   22117            4 :       break;
   22118            1 :     case E_V8DImode:
   22119            1 :       gen = gen_avx512f_permvarv8di;
   22120            1 :       break;
   22121           30 :     case E_V8DFmode:
   22122           30 :       gen = gen_avx512f_permvarv8df;
   22123           30 :       maskmode = V8DImode;
   22124           30 :       break;
   22125          108 :     case E_V32HImode:
   22126          108 :       gen = gen_avx512bw_permvarv32hi;
   22127          108 :       break;
   22128           14 :     case E_V16HImode:
   22129           14 :       gen = gen_avx512vl_permvarv16hi;
   22130           14 :       break;
   22131            6 :     case E_V8HImode:
   22132            6 :       gen = gen_avx512vl_permvarv8hi;
   22133            6 :       break;
   22134            4 :     case E_V64QImode:
   22135            4 :       gen = gen_avx512bw_permvarv64qi;
   22136            4 :       break;
   22137            2 :     case E_V32QImode:
   22138            2 :       gen = gen_avx512vl_permvarv32qi;
   22139            2 :       break;
   22140            0 :     case E_V16QImode:
   22141            0 :       gen = gen_avx512vl_permvarv16qi;
   22142            0 :       break;
   22143              : 
   22144              :     default:
   22145              :       return false;
   22146              :     }
   22147              : 
   22148          201 :   if (d->testing_p)
   22149              :     return true;
   22150              : 
   22151          192 :   target = d->target;
   22152          192 :   op0 = d->op0;
   22153         4920 :   for (int i = 0; i < d->nelt; ++i)
   22154         4728 :     vec[i] = GEN_INT (d->perm[i]);
   22155          192 :   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
   22156          192 :   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
   22157          192 :   return true;
   22158              : }
   22159              : 
   22160              : static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
   22161              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  Strategies are tried cheapest-first:
   identity/broadcast, plain VEC_SELECT, SEL+CONCAT interleave forms,
   blends, movs/insertps, vpermil, pshufb-family, palignr, the AVX512
   variable permutes, and finally a retry in a wider integer mode.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      /* With one operand the which-input bit is meaningless, so fold
	 all indices into the 0..nelt-1 range.  */
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      /* An identity permutation is just a move.  */
      if (identity_perm)
	{
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      /* All indices zero: splat of element 0.  */
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Try the SSE4.1 insertps instruction.  */
  if (expand_vec_perm_insertps (d))
    return true;

  /* Try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      /* Flip the which-operand bit of every index to match the
	 swapped (op1, op0) concatenation below.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{w,b,s,d} instructions  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  Recurses at most a few times, since canonicalization widens
     the element mode each step.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
   22355              : 
   22356              : /* Canonicalize vec_perm index to make the first index
   22357              :    always comes from the first vector.  */
   22358              : static void
   22359         8189 : ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
   22360              : {
   22361         8189 :   unsigned nelt = d->nelt;
   22362         8189 :   if (d->perm[0] < nelt)
   22363              :     return;
   22364              : 
   22365            5 :   for (unsigned i = 0; i != nelt; i++)
   22366            4 :     d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
   22367              : 
   22368            1 :   std::swap (d->op0, d->op1);
   22369            1 :   return;
   22370              : }
   22371              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of shufps + shufps/pshufd instructions.  Only
   handles two-operand V4SImode/V4SFmode permutations.  */
static bool
expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
{
  unsigned char perm1[4];
  machine_mode vmode = d->vmode;
  bool ok;
  unsigned i, j, k, count = 0;

  if (d->one_operand_p
      || (vmode != V4SImode && vmode != V4SFmode))
    return false;

  /* Any two-operand 4-element permutation of these modes can be done
     with at most two instructions, so testing always succeeds.  */
  if (d->testing_p)
    return true;

  /* Make perm[0] select from op0, then count how many of the four
     elements are taken from the second operand.  */
  ix86_vec_perm_index_canon (d);
  for (i = 0; i < 4; ++i)
    count += d->perm[i] > 3 ? 1 : 0;

  /* COUNT must be 1, 2 or 3 here: 0 or 4 would have meant a
     one-operand permutation, which was rejected above.  */
  gcc_assert (count & 3);

  rtx tmp = gen_reg_rtx (vmode);
  /* 2 from op0 and 2 from op1.  */
  if (count == 2)
    {
      unsigned char perm2[4];
      /* Build PERM1 gathering the op0 elements into the low half and
	 the op1 elements into the high half; PERM2 records where each
	 original position ends up so it can be restored afterwards.  */
      for (i = 0, j = 0, k = 2; i < 4; ++i)
	if (d->perm[i] & 4)
	  {
	    perm1[k++] = d->perm[i];
	    perm2[i] = k - 1;
	  }
	else
	  {
	    perm1[j++] = d->perm[i];
	    perm2[i] = j - 1;
	  }

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				  perm1, d->nelt, false);
      gcc_assert (ok);
      if (vmode == V4SImode && TARGET_SSE2)
      /* pshufd.  */
	ok = expand_vselect (d->target, tmp,
			     perm2, d->nelt, false);
      else
	{
	  /* shufps on TMP concatenated with itself; bias the high
	     selectors into the second copy.  */
	  perm2[2] += 4;
	  perm2[3] += 4;
	  ok = expand_vselect_vconcat (d->target, tmp, tmp,
				       perm2, d->nelt, false);
	}
      gcc_assert (ok);
    }
  /* 3 from one op and 1 from another.  */
  else
    {
      unsigned pair_idx = 8, lone_idx = 8, shift;

      /* Find the lone index, i.e. the one position whose element comes
	 from the minority operand.  */
      for (i = 0; i < 4; ++i)
	if ((d->perm[i] > 3 && count == 1)
	    || (d->perm[i] < 4 && count == 3))
	  lone_idx = i;

      /* After canonicalization, a nonzero lone_idx implies the lone
	 element comes from the second op (count == 1).  */
      gcc_assert (count == (lone_idx ? 1 : 3));

      /* Find the pair index that sits in the same half as the lone index.  */
      shift = lone_idx & 2;
      pair_idx = 1 - lone_idx + 2 * shift;

      /* First permute the lone index and pair index into the same vector
	 as [ lone, lone, pair, pair ].  */
      perm1[1] = perm1[0]
	= (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
      perm1[3] = perm1[2]
	= (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;

      /* Always put the vector containing the lone index first.  */
      if (count == 1)
	std::swap (d->op0, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				   perm1, d->nelt, false);
      gcc_assert (ok);

      /* Restore the lone and pair elements to their original order.  */
      perm1[shift] = lone_idx << 1;
      perm1[shift + 1] = pair_idx << 1;

      /* Select the remaining 2 elements from the other vector.  */
      for (i = 2 - shift; i < 4 - shift; ++i)
	perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];

      /* Adjust the operand order to match the selector built above.  */
      if (lone_idx > 1)
	std::swap (tmp, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (d->target, tmp, d->op1,
				   perm1, d->nelt, false);

      gcc_assert (ok);
    }

  return true;
}
   22485              : 
   22486              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   22487              :    in terms of a pair of pshuflw + pshufhw instructions.  */
   22488              : 
   22489              : static bool
   22490       100982 : expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
   22491              : {
   22492       100982 :   unsigned char perm2[MAX_VECT_LEN];
   22493       100982 :   unsigned i;
   22494       100982 :   bool ok;
   22495              : 
   22496       100982 :   if (d->vmode != V8HImode || !d->one_operand_p)
   22497              :     return false;
   22498              : 
   22499              :   /* The two permutations only operate in 64-bit lanes.  */
   22500        12859 :   for (i = 0; i < 4; ++i)
   22501        10382 :     if (d->perm[i] >= 4)
   22502              :       return false;
   22503        12329 :   for (i = 4; i < 8; ++i)
   22504         9866 :     if (d->perm[i] < 4)
   22505              :       return false;
   22506              : 
   22507         2463 :   if (d->testing_p)
   22508              :     return true;
   22509              : 
   22510              :   /* Emit the pshuflw.  */
   22511          134 :   memcpy (perm2, d->perm, 4);
   22512          670 :   for (i = 4; i < 8; ++i)
   22513          536 :     perm2[i] = i;
   22514          134 :   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
   22515          134 :   gcc_assert (ok);
   22516              : 
   22517              :   /* Emit the pshufhw.  */
   22518          134 :   memcpy (perm2 + 4, d->perm + 4, 4);
   22519          670 :   for (i = 0; i < 4; ++i)
   22520          536 :     perm2[i] = i;
   22521          134 :   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
   22522          134 :   gcc_assert (ok);
   22523              : 
   22524              :   return true;
   22525              : }
   22526              : 
/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle:
   concatenate both 64-bit operands into a single 128-bit register and
   perform a one-operand shuffle there, taking the low half of the
   result.  */
static bool
expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
{
  /* Only two-operand 64-bit vector modes, and only when MMX values
     live in SSE registers.  */
  if (GET_MODE_BITSIZE (d->vmode) != 64
      || !TARGET_MMX_WITH_SSE
      || d->one_operand_p)
    return false;

  /* Pick the 128-bit mode the shuffle will be performed in; the QI/HI
     element cases additionally require SSSE3 for pshufb.  */
  machine_mode widen_vmode;
  switch (d->vmode)
    {
    /* pshufd.  */
    case E_V2SImode:
      widen_vmode = V4SImode;
      break;

    /* pshufd.  */
    case E_V2SFmode:
      widen_vmode = V4SFmode;
      break;

    case E_V4HImode:
      widen_vmode = V8HImode;
      /* pshufb.  */
      if (!TARGET_SSSE3)
	return false;
      break;

    case E_V8QImode:
      /* pshufb.  */
      widen_vmode = V16QImode;
      if (!TARGET_SSSE3)
	return false;
      break;

    default:
      return false;
    }

  if (d->testing_p)
    return true;

  /* Concatenate op0 and op1 into one 128-bit register and build a
     one-operand permutation on the widened mode.  */
  struct expand_vec_perm_d dperm;
  dperm.target = gen_reg_rtx (widen_vmode);
  rtx op0 = gen_reg_rtx (widen_vmode);
  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
  dperm.op0 = op0;
  dperm.op1 = op0;
  dperm.vmode = widen_vmode;
  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
  dperm.nelt = nelt;
  dperm.one_operand_p = true;
  dperm.testing_p = false;

  /* Duplicate the requested selectors into both halves; only the low
     half of the result is ultimately used.  */
  for (unsigned i = 0; i != nelt / 2; i++)
    {
      dperm.perm[i] = d->perm[i];
      dperm.perm[i + nelt / 2] = d->perm[i];
    }

  gcc_assert (expand_vec_perm_1 (&dperm));
  emit_move_insn (d->target, lowpart_subreg (d->vmode,
					     dperm.target,
					     dperm.vmode));
  return true;
}
   22594              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  /* Compute the [min, max] window of selector values, both for the
     operands in their given order (min/max) and with the operands
     swapped (minswap/maxswap).  */
  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  /* For 32-byte modes fold the index to be relative to a
	     128-bit lane, keeping the operand-select bit adjacent,
	     since vpalignr works per 128-bit lane.  */
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  /* The window must not start at 0 (nothing to shift) and must fit in
     a single vector (or a single lane for 32-byte modes); otherwise
     retry with the operands swapped before giving up.  */
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      /* Apply the operand swap chosen above, flipping the operand
	 select bit in every index.  */
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  /* Rebase all indices to the start of the window; IN_ORDER tracks
     whether the alignment alone already yields the identity.  */
  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  /* Emit the palignr itself, shifting MIN elements down.  */
  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (V1TImode);
      emit_insn (gen_ssse3_palignrv1ti (target,
					gen_lowpart (V1TImode, dcopy.op1),
					gen_lowpart (V1TImode, dcopy.op0),
					shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  /* Otherwise finish with a one-operand permutation of the aligned
     value; only the 32-byte case is allowed to fail here.  */
  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
   22729              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes; bit 0 of WHICH means some out-of-place element
     comes from op0, bit 1 means some comes from op1.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  /* Mask off the operand-select bit so all indices refer to the one
     chosen operand.  */
  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  /* For the non-16-byte cases the one-operand permutation may not be
     expandable; only the 16-byte case is guaranteed.  */
  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  /* Build the blend selector: take position I from the permuted vector
     exactly where the original permutation selected from op1.  */
  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}
   22814              : 
/* Defined later in this file; forward-declared here because
   expand_vec_perm_interleave2 below probes it for 32-byte modes.  */
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
   22816              : 
   22817              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   22818              :    a two vector permutation into a single vector permutation by using
   22819              :    an interleave operation to merge the vectors.  */
   22820              : 
   22821              : static bool
   22822        95384 : expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   22823              : {
   22824        95384 :   struct expand_vec_perm_d dremap, dfinal;
   22825        95384 :   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
   22826        95384 :   unsigned HOST_WIDE_INT contents;
   22827        95384 :   unsigned char remap[2 * MAX_VECT_LEN];
   22828        95384 :   rtx_insn *seq;
   22829        95384 :   bool ok, same_halves = false;
   22830              : 
   22831        95384 :   if (GET_MODE_SIZE (d->vmode) == 4
   22832       171866 :       || GET_MODE_SIZE (d->vmode) == 8
   22833       233334 :       || GET_MODE_SIZE (d->vmode) == 16)
   22834              :     {
   22835        87983 :       if (d->one_operand_p)
   22836              :         return false;
   22837              :     }
   22838        14802 :   else if (GET_MODE_SIZE (d->vmode) == 32)
   22839              :     {
   22840         7049 :       if (!TARGET_AVX)
   22841              :         return false;
   22842              :       /* For 32-byte modes allow even d->one_operand_p.
   22843              :          The lack of cross-lane shuffling in some instructions
   22844              :          might prevent a single insn shuffle.  */
   22845         7049 :       dfinal = *d;
   22846         7049 :       dfinal.testing_p = true;
   22847              :       /* If expand_vec_perm_interleave3 can expand this into
   22848              :          a 3 insn sequence, give up and let it be expanded as
   22849              :          3 insn sequence.  While that is one insn longer,
   22850              :          it doesn't need a memory operand and in the common
   22851              :          case that both interleave low and high permutations
   22852              :          with the same operands are adjacent needs 4 insns
   22853              :          for both after CSE.  */
   22854         7049 :       if (expand_vec_perm_interleave3 (&dfinal))
   22855              :         return false;
   22856              :     }
   22857              :   else
   22858              :     return false;
   22859              : 
   22860              :   /* Examine from whence the elements come.  */
   22861        89613 :   contents = 0;
   22862       680829 :   for (i = 0; i < nelt; ++i)
   22863       591216 :     contents |= HOST_WIDE_INT_1U << d->perm[i];
   22864              : 
   22865        89613 :   memset (remap, 0xff, sizeof (remap));
   22866        89613 :   dremap = *d;
   22867              : 
   22868        89613 :   if (GET_MODE_SIZE (d->vmode) == 4
   22869       171440 :       || GET_MODE_SIZE (d->vmode) == 8)
   22870              :     {
   22871        23345 :       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   22872              : 
   22873              :       /* Split the two input vectors into 4 halves.  */
   22874        23345 :       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   22875        23345 :       h2 = h1 << nelt2;
   22876        23345 :       h3 = h2 << nelt2;
   22877        23345 :       h4 = h3 << nelt2;
   22878              : 
   22879              :       /* If the elements from the low halves use interleave low,
   22880              :          and similarly for interleave high.  */
   22881        23345 :       if ((contents & (h1 | h3)) == contents)
   22882              :         {
   22883              :           /* punpckl* */
   22884         3247 :           for (i = 0; i < nelt2; ++i)
   22885              :             {
   22886         2292 :               remap[i] = i * 2;
   22887         2292 :               remap[i + nelt] = i * 2 + 1;
   22888         2292 :               dremap.perm[i * 2] = i;
   22889         2292 :               dremap.perm[i * 2 + 1] = i + nelt;
   22890              :             }
   22891              :         }
   22892        22390 :       else if ((contents & (h2 | h4)) == contents)
   22893              :         {
   22894              :           /* punpckh* */
   22895         2836 :           for (i = 0; i < nelt2; ++i)
   22896              :             {
   22897         2000 :               remap[i + nelt2] = i * 2;
   22898         2000 :               remap[i + nelt + nelt2] = i * 2 + 1;
   22899         2000 :               dremap.perm[i * 2] = i + nelt2;
   22900         2000 :               dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   22901              :             }
   22902              :         }
   22903              :       else
   22904              :         return false;
   22905              :     }
   22906       132536 :   else if (GET_MODE_SIZE (d->vmode) == 16)
   22907              :     {
   22908        59437 :       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   22909              : 
   22910              :       /* Split the two input vectors into 4 halves.  */
   22911        59437 :       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   22912        59437 :       h2 = h1 << nelt2;
   22913        59437 :       h3 = h2 << nelt2;
   22914        59437 :       h4 = h3 << nelt2;
   22915              : 
   22916              :       /* If the elements from the low halves use interleave low, and similarly
   22917              :          for interleave high.  If the elements are from mis-matched halves, we
   22918              :          can use shufps for V4SF/V4SI or do a DImode shuffle.  */
   22919        59437 :       if ((contents & (h1 | h3)) == contents)
   22920              :         {
   22921              :           /* punpckl* */
   22922         5923 :           for (i = 0; i < nelt2; ++i)
   22923              :             {
   22924         4382 :               remap[i] = i * 2;
   22925         4382 :               remap[i + nelt] = i * 2 + 1;
   22926         4382 :               dremap.perm[i * 2] = i;
   22927         4382 :               dremap.perm[i * 2 + 1] = i + nelt;
   22928              :             }
   22929         1541 :           if (!TARGET_SSE2 && d->vmode == V4SImode)
   22930            0 :             dremap.vmode = V4SFmode;
   22931              :         }
   22932        57896 :       else if ((contents & (h2 | h4)) == contents)
   22933              :         {
   22934              :           /* punpckh* */
   22935         5130 :           for (i = 0; i < nelt2; ++i)
   22936              :             {
   22937         3762 :               remap[i + nelt2] = i * 2;
   22938         3762 :               remap[i + nelt + nelt2] = i * 2 + 1;
   22939         3762 :               dremap.perm[i * 2] = i + nelt2;
   22940         3762 :               dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   22941              :             }
   22942         1368 :           if (!TARGET_SSE2 && d->vmode == V4SImode)
   22943            0 :             dremap.vmode = V4SFmode;
   22944              :         }
   22945        56528 :       else if ((contents & (h1 | h4)) == contents)
   22946              :         {
   22947              :           /* shufps */
   22948         2537 :           for (i = 0; i < nelt2; ++i)
   22949              :             {
   22950         1828 :               remap[i] = i;
   22951         1828 :               remap[i + nelt + nelt2] = i + nelt2;
   22952         1828 :               dremap.perm[i] = i;
   22953         1828 :               dremap.perm[i + nelt2] = i + nelt + nelt2;
   22954              :             }
   22955          709 :           if (nelt != 4)
   22956              :             {
   22957              :               /* shufpd */
   22958           69 :               dremap.vmode = V2DImode;
   22959           69 :               dremap.nelt = 2;
   22960           69 :               dremap.perm[0] = 0;
   22961           69 :               dremap.perm[1] = 3;
   22962              :             }
   22963              :         }
   22964        55819 :       else if ((contents & (h2 | h3)) == contents)
   22965              :         {
   22966              :           /* shufps */
   22967         3483 :           for (i = 0; i < nelt2; ++i)
   22968              :             {
   22969         2458 :               remap[i + nelt2] = i;
   22970         2458 :               remap[i + nelt] = i + nelt2;
   22971         2458 :               dremap.perm[i] = i + nelt2;
   22972         2458 :               dremap.perm[i + nelt2] = i + nelt;
   22973              :             }
   22974         1025 :           if (nelt != 4)
   22975              :             {
   22976              :               /* shufpd */
   22977           76 :               dremap.vmode = V2DImode;
   22978           76 :               dremap.nelt = 2;
   22979           76 :               dremap.perm[0] = 1;
   22980           76 :               dremap.perm[1] = 2;
   22981              :             }
   22982              :         }
   22983              :       else
   22984              :         return false;
   22985              :     }
   22986              :   else
   22987              :     {
   22988         6831 :       unsigned int nelt4 = nelt / 4, nzcnt = 0;
   22989         6831 :       unsigned HOST_WIDE_INT q[8];
   22990         6831 :       unsigned int nonzero_halves[4];
   22991              : 
   22992              :       /* Split the two input vectors into 8 quarters.  */
   22993         6831 :       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
   22994        54648 :       for (i = 1; i < 8; ++i)
   22995        47817 :         q[i] = q[0] << (nelt4 * i);
   22996        34155 :       for (i = 0; i < 4; ++i)
   22997        27324 :         if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
   22998              :           {
   22999        24487 :             nonzero_halves[nzcnt] = i;
   23000        24487 :             ++nzcnt;
   23001              :           }
   23002              : 
   23003         6831 :       if (nzcnt == 1)
   23004              :         {
   23005          221 :           gcc_assert (d->one_operand_p);
   23006          221 :           nonzero_halves[1] = nonzero_halves[0];
   23007          221 :           same_halves = true;
   23008              :         }
   23009         6610 :       else if (d->one_operand_p)
   23010              :         {
   23011           23 :           gcc_assert (nonzero_halves[0] == 0);
   23012           23 :           gcc_assert (nonzero_halves[1] == 1);
   23013              :         }
   23014              : 
   23015         6831 :       if (nzcnt <= 2)
   23016              :         {
   23017          544 :           if (d->perm[0] / nelt2 == nonzero_halves[1])
   23018              :             {
   23019              :               /* Attempt to increase the likelihood that dfinal
   23020              :                  shuffle will be intra-lane.  */
   23021          229 :               std::swap (nonzero_halves[0], nonzero_halves[1]);
   23022              :             }
   23023              : 
   23024              :           /* vperm2f128 or vperm2i128.  */
   23025         3526 :           for (i = 0; i < nelt2; ++i)
   23026              :             {
   23027         2982 :               remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
   23028         2982 :               remap[i + nonzero_halves[0] * nelt2] = i;
   23029         2982 :               dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
   23030         2982 :               dremap.perm[i] = i + nonzero_halves[0] * nelt2;
   23031              :             }
   23032              : 
   23033          544 :           if (d->vmode != V8SFmode
   23034              :               && d->vmode != V4DFmode
   23035              :               && d->vmode != V8SImode)
   23036              :             {
   23037          132 :               dremap.vmode = V8SImode;
   23038          132 :               dremap.nelt = 8;
   23039          660 :               for (i = 0; i < 4; ++i)
   23040              :                 {
   23041          528 :                   dremap.perm[i] = i + nonzero_halves[0] * 4;
   23042          528 :                   dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
   23043              :                 }
   23044              :             }
   23045              :         }
   23046         6287 :       else if (d->one_operand_p)
   23047         5822 :         return false;
   23048         6287 :       else if (TARGET_AVX2
   23049         2600 :                && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
   23050              :         {
   23051              :           /* vpunpckl* */
   23052          491 :           for (i = 0; i < nelt4; ++i)
   23053              :             {
   23054          247 :               remap[i] = i * 2;
   23055          247 :               remap[i + nelt] = i * 2 + 1;
   23056          247 :               remap[i + nelt2] = i * 2 + nelt2;
   23057          247 :               remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
   23058          247 :               dremap.perm[i * 2] = i;
   23059          247 :               dremap.perm[i * 2 + 1] = i + nelt;
   23060          247 :               dremap.perm[i * 2 + nelt2] = i + nelt2;
   23061          247 :               dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
   23062              :             }
   23063              :         }
   23064         6043 :       else if (TARGET_AVX2
   23065         2356 :                && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
   23066              :         {
   23067              :           /* vpunpckh* */
   23068          445 :           for (i = 0; i < nelt4; ++i)
   23069              :             {
   23070          224 :               remap[i + nelt4] = i * 2;
   23071          224 :               remap[i + nelt + nelt4] = i * 2 + 1;
   23072          224 :               remap[i + nelt2 + nelt4] = i * 2 + nelt2;
   23073          224 :               remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
   23074          224 :               dremap.perm[i * 2] = i + nelt4;
   23075          224 :               dremap.perm[i * 2 + 1] = i + nelt + nelt4;
   23076          224 :               dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
   23077          224 :               dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
   23078              :             }
   23079              :         }
   23080              :       else
   23081              :         return false;
   23082              :     }
   23083              : 
   23084              :   /* Use the remapping array set up above to move the elements from their
   23085              :      swizzled locations into their final destinations.  */
   23086         7443 :   dfinal = *d;
   23087        48735 :   for (i = 0; i < nelt; ++i)
   23088              :     {
   23089        41292 :       unsigned e = remap[d->perm[i]];
   23090        41292 :       gcc_assert (e < nelt);
   23091              :       /* If same_halves is true, both halves of the remapped vector are the
   23092              :          same.  Avoid cross-lane accesses if possible.  */
   23093        41292 :       if (same_halves && i >= nelt2)
   23094              :         {
   23095          816 :           gcc_assert (e < nelt2);
   23096          816 :           dfinal.perm[i] = e + nelt2;
   23097              :         }
   23098              :       else
   23099        40476 :         dfinal.perm[i] = e;
   23100              :     }
   23101         7443 :   if (!d->testing_p)
   23102              :     {
   23103         2775 :       dremap.target = gen_reg_rtx (dremap.vmode);
   23104         2775 :       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
   23105              :     }
   23106         7443 :   dfinal.op1 = dfinal.op0;
   23107         7443 :   dfinal.one_operand_p = true;
   23108              : 
   23109              :   /* Test if the final remap can be done with a single insn.  For V4SFmode or
   23110              :      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
   23111         7443 :   start_sequence ();
   23112         7443 :   ok = expand_vec_perm_1 (&dfinal);
   23113         7443 :   seq = end_sequence ();
   23114              : 
   23115         7443 :   if (!ok)
   23116              :     return false;
   23117              : 
   23118         6387 :   if (d->testing_p)
   23119              :     return true;
   23120              : 
   23121         2736 :   if (dremap.vmode != dfinal.vmode)
   23122              :     {
   23123           55 :       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
   23124           55 :       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
   23125              :     }
   23126              : 
   23127         2736 :   ok = expand_vec_perm_1 (&dremap);
   23128         2736 :   gcc_assert (ok);
   23129              : 
   23130         2736 :   emit_insn (seq);
   23131         2736 :   return true;
   23132              : }
   23133              : 
   23134              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   23135              :    a single vector cross-lane permutation into vpermq followed
   23136              :    by any of the single insn permutations.  */
   23137              : 
static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  /* Only for AVX2 one-operand V32QI/V16HI permutations: vpermq works on
     64-bit (quarter-vector) chunks of a 256-bit register.  */
  if (!(TARGET_AVX2
        && (d->vmode == V32QImode || d->vmode == V16HImode)
        && d->one_operand_p))
    return false;

  /* contents[k] is a 4-bit set recording which quarters (nelt4-element
     groups) of the input feed the low (k == 0) resp. high (k == 1) half
     of the result.  */
  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  /* Each 128-bit lane of the vpermq result holds only two quarters, so a
     result half drawing from more than two source quarters is infeasible.  */
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
          return false;
    }

  if (d->testing_p)
    return true;

  /* First insn: vpermq (V4DImode permutation) gathering the needed source
     quarters into the proper lanes.  */
  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      /* Place the (at most two) quarters used by half I into lane I,
         padding unused slots with quarter 0.  */
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0)
          dremap.perm[2 * i + cnt++] = j;
      for (; cnt < 2; ++cnt)
        dremap.perm[2 * i + cnt] = 0;
    }

  /* Second insn: an intra-lane permutation of the vpermq result.  */
  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      /* j indexes dremap.perm: 0 for elements landing in the low lane,
         2 once we cross into the high lane.  */
      if (i == nelt2)
        j = 2;
      /* Offset within a quarter, plus lane selection.  */
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
        ;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
        dfinal.perm[i] |= nelt4;
      else
        /* The feasibility check above guarantees every source quarter was
           placed in dremap.perm[j] or dremap.perm[j + 1].  */
        gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}
   23212              : 
   23213              : static bool canonicalize_perm (struct expand_vec_perm_d *d);
   23214              : 
   23215              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   23216              :    a vector permutation using two instructions, vperm2f128 resp.
   23217              :    vperm2i128 followed by any single in-lane permutation.  */
   23218              : 
static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  /* Requires a 32-byte vector; integer modes additionally need AVX2
     (vperm2i128), float modes only AVX (vperm2f128).  */
  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
        {
          /* The second shuffle for e.g. V4DFmode has
             0123 and ABCD operands.
             Ignore AB23, as 23 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (1 << 2)) continue;
          /* And 01CD, as 01 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 0) continue;
          /* And 4567, as then the vperm2[fi]128 doesn't change
             anything on the original 4567 second operand.  */
          if ((perm & 0xf) == ((3 << 2) | 2)) continue;
        }
      else
        {
          /* The second shuffle for e.g. V4DFmode has
             4567 and ABCD operands.
             Ignore AB67, as 67 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (3 << 2)) continue;
          /* And 45CD, as 45 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 2) continue;
          /* And 0123, as then the vperm2[fi]128 doesn't change
             anything on the original 0123 first operand.  */
          if ((perm & 0xf) == (1 << 2)) continue;
        }

      /* Check whether every requested element is reachable either from the
         vperm2[fi]128 result for this immediate (first case) or from the
         unmodified original operand (second case); otherwise this PERM
         candidate fails.  */
      for (i = 0; i < nelt; i++)
        {
          j = d->perm[i] / nelt2;
          if (j == ((perm >> (2 * (i >= nelt2))) & 3))
            dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
          else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
            dsecond.perm[i] = d->perm[i] & (nelt - 1);
          else
            break;
        }

      /* i == nelt iff all elements were covered; test (and discard) the
         trial expansion of the second shuffle.  */
      if (i == nelt)
        {
          start_sequence ();
          ok = expand_vec_perm_1 (&dsecond);
          end_sequence ();
        }
      else
        ok = false;

      if (ok)
        {
          if (d->testing_p)
            return true;

          /* Found a usable second shuffle.  dfirst will be
             vperm2f128 on d->op0 and d->op1.  */
          dsecond.testing_p = false;
          dfirst = *d;
          dfirst.target = gen_reg_rtx (d->vmode);
          for (i = 0; i < nelt; i++)
            dfirst.perm[i] = (i & (nelt2 - 1))
                             + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

          canonicalize_perm (&dfirst);
          ok = expand_vec_perm_1 (&dfirst);
          gcc_assert (ok);

          /* And dsecond is some single insn shuffle, taking
             d->op0 and result of vperm2f128 (if perm < 16) or
             d->op1 and result of vperm2f128 (otherwise).  */
          if (perm >= 16)
            dsecond.op0 = dsecond.op1;
          dsecond.op1 = dfirst.target;

          ok = expand_vec_perm_1 (&dsecond);
          gcc_assert (ok);

          return true;
        }

      /* For one operand, the only useful vperm2f128 permutation is 0x01
         aka lanes swap.  */
      if (d->one_operand_p)
        return false;
    }

  return false;
}
   23331              : 
   23332              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   23333              :    a two vector permutation using 2 intra-lane interleave insns
   23334              :    and cross-lane shuffle for 32-byte vectors.  */
   23335              : 
   23336              : static bool
   23337        34475 : expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
   23338              : {
   23339        34475 :   unsigned i, nelt;
   23340        34475 :   rtx (*gen) (rtx, rtx, rtx);
   23341              : 
   23342        34475 :   if (d->one_operand_p)
   23343              :     return false;
   23344        33191 :   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
   23345              :     ;
   23346        24984 :   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
   23347              :     ;
   23348              :   else
   23349              :     return false;
   23350              : 
   23351         9717 :   nelt = d->nelt;
   23352         9717 :   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
   23353              :     return false;
   23354         9877 :   for (i = 0; i < nelt; i += 2)
   23355         9521 :     if (d->perm[i] != d->perm[0] + i / 2
   23356         8648 :         || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
   23357              :       return false;
   23358              : 
   23359          356 :   if (d->testing_p)
   23360              :     return true;
   23361              : 
   23362           56 :   switch (d->vmode)
   23363              :     {
   23364           32 :     case E_V32QImode:
   23365           32 :       if (d->perm[0])
   23366              :         gen = gen_vec_interleave_highv32qi;
   23367              :       else
   23368           16 :         gen = gen_vec_interleave_lowv32qi;
   23369              :       break;
   23370           18 :     case E_V16HImode:
   23371           18 :       if (d->perm[0])
   23372              :         gen = gen_vec_interleave_highv16hi;
   23373              :       else
   23374            9 :         gen = gen_vec_interleave_lowv16hi;
   23375              :       break;
   23376            0 :     case E_V8SImode:
   23377            0 :       if (d->perm[0])
   23378              :         gen = gen_vec_interleave_highv8si;
   23379              :       else
   23380            0 :         gen = gen_vec_interleave_lowv8si;
   23381              :       break;
   23382            4 :     case E_V4DImode:
   23383            4 :       if (d->perm[0])
   23384              :         gen = gen_vec_interleave_highv4di;
   23385              :       else
   23386            2 :         gen = gen_vec_interleave_lowv4di;
   23387              :       break;
   23388            2 :     case E_V8SFmode:
   23389            2 :       if (d->perm[0])
   23390              :         gen = gen_vec_interleave_highv8sf;
   23391              :       else
   23392            1 :         gen = gen_vec_interleave_lowv8sf;
   23393              :       break;
   23394            0 :     case E_V4DFmode:
   23395            0 :       if (d->perm[0])
   23396              :         gen = gen_vec_interleave_highv4df;
   23397              :       else
   23398            0 :         gen = gen_vec_interleave_lowv4df;
   23399              :       break;
   23400            0 :     default:
   23401            0 :       gcc_unreachable ();
   23402              :     }
   23403              : 
   23404           56 :   emit_insn (gen (d->target, d->op0, d->op1));
   23405           56 :   return true;
   23406              : }
   23407              : 
   23408              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   23409              :    a single vector permutation using a single intra-lane vector
   23410              :    permutation, vperm2f128 swapping the lanes and vblend* insn blending
   23411              :    the non-swapped and swapped vectors together.  */
   23412              : 
static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only for one-operand V8SF/V4DF on AVX without AVX2 (AVX2 has cheaper
     cross-lane permutations, handled elsewhere).  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  /* Build dfirst, an intra-lane permutation placing each requested element
     either at its final slot (same lane) or at the lane-swapped slot, so
     a subsequent lane swap + blend can finish the job.  0xff marks an
     as-yet-unassigned slot.  */
  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      /* j is slot i with its lane bit forced to match the source lane of
         d->perm[i], i.e. where element i must sit before the lane swap.  */
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      /* Two result elements demanding different sources in the same
         pre-swap slot cannot both be satisfied.  */
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
        return false;
      dfirst.perm[j] = d->perm[i];
      /* msk collects the result slots that must be taken from the
         lane-swapped copy; it becomes the vblend* immediate.  */
      if (j != i)
        msk |= (1 << i);
    }
  /* Fill unconstrained slots with the identity.  */
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  /* Trial-expand dfirst into a pending sequence so it can be discarded
     if it needs more than one insn.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  /* dsecond swaps the two 128-bit lanes of dfirst's result.  */
  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  /* Blend the unswapped and swapped vectors, taking swapped elements at
     the positions recorded in msk.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
   23474              : 
   23475              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   23476              :    a two vector permutation using two single vector permutations and
   23477              :    {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   23478              :    of dfirst or dsecond is identity permutation.  */
   23479              : 
static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
        return false;
      /* Integer 16-byte modes need SSE2 punpck*; V4SF/V2DF can use
         unpcklps/unpcklpd with plain SSE.  */
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
        return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
        return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
        return false;
      /* 32-byte AVX interleaves operate per 128-bit lane.  */
      lane = nelt2;
    }
  else
    return false;

  /* The permutation must strictly alternate between the two operands:
     element 0 fixes the parity, then even/odd slots must come from
     opposite operands throughout.  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  /* Distribute the requested elements into single-operand permutations
     dfirst (elements of op0) and dsecond (elements of op1), duplicating
     each half so that a final unpck-low interleave reassembles d->perm.
     identN tracks whether permutation N turned out to be the identity
     (and so needs no insn).  */
  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
        dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
        if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
          ident2 = false;
        dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
          = d->perm[i] - nelt;
      }
    else
      {
        dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
        if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
          ident1 = false;
        dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  /* With two_insn, insist that at least one side is an identity so the
     whole expansion is interleave plus at most one permutation.  */
  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
        dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
        dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      /* If slot 0 comes from op1, swap operands so the final interleave's
         even slots read the correct source.  */
      if (d->perm[0] >= nelt)
        std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Trial-expand the non-identity single-operand permutations into
     pending sequences; bail out (discarding them) on failure.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = end_sequence ();

      if (!ok)
        return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = end_sequence ();

      if (!ok)
        return false;
    }

  if (d->testing_p)
    return true;

  /* dfinal is the interleave-low pattern: even slots from op0, odd slots
     from op1, per lane for 32-byte modes.  */
  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
        dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
        dfinal.perm[i] += nelt;
    }
  /* emit_insn (NULL) is harmless for the identity sides.  */
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
                               dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
   23591              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If two_insn, succeed only if one of dfirst or dsecond is
   identity permutation.  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  /* Whether the permutation restricted to op0 (resp. op1) lanes is the
     identity, in which case no shuffle is needed for that operand.  */
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* dfirst permutes op0 alone, dsecond permutes op1 alone, and dfinal
     blends the two intermediate results element-wise.  */
  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  /* Split d->perm between dfirst and dsecond; 0xff marks a don't-care
     slot (element supplied by the other operand).  */
  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	/* For 32-byte modes, prefer mirroring the used element from the
	   other 128-bit lane so the single permutation stays intra-lane.  */
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      /* Fresh pseudos for the shuffled intermediates; identity operands
	 are fed straight into the blend.  */
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Expand each needed single-operand permutation into its own sequence
     so we emit nothing if either one fails.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* The final blend picks element i from op1 iff d->perm[i] referenced
     the second operand.  */
  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}
   23709              : 
   23710              : /* A subroutine of ix86_expand_vec_perm_const_1.
   23711              :    Implement a permutation with psrlw, psllw and por.
   23712              :    It handles case:
   23713              :    __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
   23714              :    __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
   23715              : 
   23716              : static bool
   23717        26148 : expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
   23718              : {
   23719        26148 :   unsigned i;
   23720        26148 :   rtx (*gen_shr) (rtx, rtx, rtx);
   23721        26148 :   rtx (*gen_shl) (rtx, rtx, rtx);
   23722        26148 :   rtx (*gen_or) (rtx, rtx, rtx);
   23723        26148 :   machine_mode mode = VOIDmode;
   23724              : 
   23725        26148 :   if (!TARGET_SSE2 || !d->one_operand_p)
   23726              :     return false;
   23727              : 
   23728         5237 :   switch (d->vmode)
   23729              :     {
   23730         1395 :     case E_V8QImode:
   23731         1395 :       if (!TARGET_MMX_WITH_SSE)
   23732              :         return false;
   23733              :       mode = V4HImode;
   23734              :       gen_shr = gen_lshrv4hi3;
   23735              :       gen_shl = gen_ashlv4hi3;
   23736              :       gen_or = gen_iorv4hi3;
   23737              :       break;
   23738              :     case E_V16QImode:
   23739              :       mode = V8HImode;
   23740              :       gen_shr = gen_lshrv8hi3;
   23741              :       gen_shl = gen_ashlv8hi3;
   23742              :       gen_or = gen_iorv8hi3;
   23743              :       break;
   23744              :     default: return false;
   23745              :     }
   23746              : 
   23747         3126 :   if (!rtx_equal_p (d->op0, d->op1))
   23748              :     return false;
   23749              : 
   23750        12166 :   for (i = 0; i < d->nelt; i += 2)
   23751        10728 :     if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
   23752              :       return false;
   23753              : 
   23754         1438 :   if (d->testing_p)
   23755              :     return true;
   23756              : 
   23757           26 :   rtx tmp1 = gen_reg_rtx (mode);
   23758           26 :   rtx tmp2 = gen_reg_rtx (mode);
   23759           26 :   rtx op0 = force_reg (d->vmode, d->op0);
   23760              : 
   23761           26 :   emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
   23762           26 :   emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
   23763           26 :   emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
   23764           26 :   emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
   23765           26 :   emit_insn (gen_or (tmp1, tmp1, tmp2));
   23766           26 :   emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
   23767              : 
   23768           26 :   return true;
   23769              : }
   23770              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  /* This expansion always succeeds for V4DF on AVX.  */
  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  /* dfirst gathers the 128-bit lanes that contain the elements wanted in
     result positions 0 and 2; dsecond those for positions 1 and 3.
     Masking with ~1 rounds each element index down to its lane start.  */
  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  /* dthird selects, within each lane, the even or odd double from the
     two intermediate vectors (vshufpd-style blend).  */
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
   23821              : 
   23822              : static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
   23823              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only useful on AVX1 without AVX2, for the 256-bit float modes that
     have vblendps/vblendpd but no cross-lane integer permutes.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  dfirst = *d;
  dsecond = *d;
  /* 0xff marks an as-yet unassigned slot in either sub-permutation.  */
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  /* Elements already in the correct lane go to dfirst; those that need
     the vperm2f128 lane swap go to dsecond, recorded at their swapped
     position.  msk accumulates the final vblend* immediate.  */
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  /* If everything lands on one side, a simpler strategy applies.  */
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  /* Fill the don't-care slots so each sub-permutation references only
     one operand where possible (which? == 2 means only op1 was used).  */
  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  /* Expand each half into its own sequence so nothing is emitted unless
     both succeed.  */
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  /* dthird swaps the two 128-bit lanes of dsecond's result
     (perm[i] = i ^ nelt2), implemented via vperm2f128.  */
  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  /* Blend the in-lane result with the lane-swapped result under msk.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
   23919              : 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  /* pshufb needs SSSE3; the vector must fit in one 16-byte register.  */
  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  /* Select the QImode vector mode and pshufb expander matching the
     vector size.  */
  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      /* Expand the element index into per-byte pshufb control bytes.  */
      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      /* Zero-fill the tail of both 16-byte masks (matters for the 4- and
	 8-byte vector modes).  */
      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  /* pshufb op0 with the first mask ...  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  /* ... pshufb op1 with the second mask ...  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  /* ... and ior the two results together; each lane is nonzero in at
     most one of them.  */
  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   24007              : 
/* Implement arbitrary permutation of one V32QImode and V16QImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      /* e is the element index within its 128-bit lane; which is nonzero
	 (eltsz * nelt/2) when the element must cross lanes.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  /* vpshufb with the cross-lane mask; the useful bytes end up in the
     wrong 128-bit half on purpose.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp (vpermq with { 2, 3, 0, 1 }).  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  /* vpshufb with the same-lane mask.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* Combine the in-lane and swapped cross-lane halves with vpor.  */
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   24078              : 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operand
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* Accept only extract-even (perm[i] == 2*i) or extract-odd
     (perm[i] == 2*i + 1) permutations.  */
  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      /* xorv moves the two middle quarters' bytes across the lane
	 boundary so each mask follows the layout described above.  */
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  /* vpshufb op0 with the first mask.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* vpshufb op1 with the second mask.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Merge the two shuffles; each byte is nonzero in at most one.  */
  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   24158              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a
   permutation (which is a blend) with and, andnot and or when pshufb is not
   available.

   It handles case:
   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);

   An element[i] must be chosen between op0[i] and op1[i] to satisfy the
   requirement.
 */

static bool
expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
{
  rtx rperm[16], vperm;
  unsigned int i, nelt = d->nelt;

  if (!TARGET_SSE2
      || d->one_operand_p
      || (d->vmode != V16QImode && d->vmode != V8HImode))
    return false;

  /* Element 0 must come from op0[0] (canonical form of such a blend).  */
  if (d->perm[0] != 0)
    return false;

  /* The dest[i] must select an element between op0[i] and op1[i].  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] % nelt) != i)
      return false;

  if (d->testing_p)
     return true;

  /* Generates a blend mask for the operators AND and ANDNOT:
     all-ones selects op0[i], all-zeros selects op1[i].  */
  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
  for (i = 0; i < nelt; i++)
    rperm[i] = (d->perm[i] <  nelt) ? CONSTM1_RTX (inner_mode)
      : CONST0_RTX (inner_mode);

  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
  vperm = force_reg (d->vmode, vperm);

  /* Emit the (op0 & mask) | (op1 & ~mask) combination.  */
  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);

  return true;
}
   24205              : 
/* Implement permutation with pslldq + psrldq + por when pshufb is not
   available.  The permutation must consist of at most two runs of
   consecutive source indices, so the result can be formed by shifting
   one run down to element 0, shifting the other run up to meet it, and
   OR-ing the two.  When PANDN is true, pand/pandn may additionally be
   used to clear lanes the shifts leave populated with stray data.
   Returns true iff the permutation was (or, with d->testing_p, could
   be) expanded.  */
static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
{
  unsigned i, nelt = d->nelt;
  /* First run: consecutive perm indices starting at perm[0]; END1 is
     the position (within the vector) of its last element.  */
  unsigned start1, end1 = -1;
  machine_mode vmode = d->vmode, imode;
  /* First perm index of the second run; -1 until a break is found.  */
  int start2 = -1;
  bool clear_op0, clear_op1;
  unsigned inner_size;
  rtx op0, op1, dop1;
  rtx (*gen_vec_shr) (rtx, rtx, rtx);
  rtx (*gen_vec_shl) (rtx, rtx, rtx);

  /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
  if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
    return false;

  /* Scan for breaks in consecutiveness; a perm index equal to nelt
     (first element of op1) also terminates the first run.  More than
     one break means the pattern doesn't fit two shifts + or.  */
  start1 = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (d->perm[i] != d->perm[i-1] + 1
          || d->perm[i] == nelt)
        {
          if (start2 == -1)
            {
              start2 = d->perm[i];
              end1 = d->perm[i-1];
            }
          else
            return false;
        }
    }

  /* Stray lanes survive the shifts when the first run's source doesn't
     end at the top of its operand, or the second run's source doesn't
     start at an operand boundary.  */
  clear_op0 = end1 != nelt - 1;
  clear_op1 = start2 % nelt != 0;
  /* pandn/pand is needed to clear upper/lower bits of op0/op1.  */
  if (!pandn && (clear_op0 || clear_op1))
    return false;

  if (d->testing_p)
    return true;

  gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
  gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
  imode = GET_MODE_INNER (vmode);
  inner_size = GET_MODE_BITSIZE (imode);
  op0 = gen_reg_rtx (vmode);
  op1 = gen_reg_rtx (vmode);

  /* Shift the first run down to element 0 (psrldq).  */
  if (start1)
    emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
  else
    emit_move_insn (op0, d->op0);

  dop1 = d->op1;
  if (d->one_operand_p)
    dop1 = d->op0;

  /* Shift the second run up so it lands right after the first run
     (pslldq).  */
  int shl_offset = end1 - start1 + 1 - start2 % nelt;
  if (shl_offset)
    emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
  else
    emit_move_insn (op1, dop1);

  /* Clear lower/upper bits for op0/op1.  */
  if (clear_op0 || clear_op1)
    {
      rtx vec[16];
      rtx const_vec;
      rtx clear;
      /* Build a mask: all-ones for the lanes the first run occupies,
         zero for the rest.  */
      for (i = 0; i != nelt; i++)
        {
          if (i < (end1 - start1 + 1))
            vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
          else
            vec[i] = CONST0_RTX (imode);
        }
      const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
      const_vec = validize_mem (force_const_mem (vmode, const_vec));
      clear = force_reg (vmode, const_vec);

      /* op0 keeps only the first-run lanes (pand); op1 keeps only the
         complementary lanes (pandn).  */
      if (clear_op0)
        emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
      if (clear_op1)
        emit_move_insn (op1, gen_rtx_AND (vmode,
                                          gen_rtx_NOT (vmode, clear),
                                          op1));
    }

  /* Combine the two halves (por).  */
  emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
  return true;
}
   24300              : 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI
   or V32QI operands with two "and" and "pack" or two "shift" and "pack"
   insns.  We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  int pblendw_i = 0;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  /* Per-mode parameters: C masks the low half of a double-wide element,
     S is that half's bit width, HALF_MODE views the operands with
     double-wide elements, and the gen_* callbacks emit the matching
     and/pack/shift patterns.  PBLENDW_I selects the even words for the
     pblendw path; END_PERM marks 256-bit modes whose packed result
     needs a final cross-lane permute.  */
  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      pblendw_i = 0x5;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      pblendw_i = 0x55;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      pblendw_i = 0x5555;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
         are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  /* Isolate the wanted half of each double-wide element in both
     operands, then pack the halves back down to the original width.  */
  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      /* Use pblendw since const_vector 0 should be cheaper than
         const_vector 0xffff.  */
      if (d->vmode == V4HImode
          || d->vmode == E_V8HImode
          || d->vmode == E_V16HImode)
        {
          rtx dop0_t = gen_reg_rtx (d->vmode);
          rtx dop1_t = gen_reg_rtx (d->vmode);
          t = gen_reg_rtx (d->vmode);
          emit_move_insn (t, CONST0_RTX (d->vmode));

          emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
                                                     GEN_INT (pblendw_i)));
          emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
                                                     GEN_INT (pblendw_i)));

          emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
          emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
        }
      else
        {
          /* Even elements: mask away the high half of each pair.  */
          t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
          t = force_reg (half_mode, t);
          emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
          emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
        }
    }
  else
    {
      /* Odd elements: shift them down into the low half of each pair.  */
      emit_insn (gen_shift (dop0,
                            gen_lowpart (half_mode, d->op0),
                            GEN_INT (s)));
      emit_insn (gen_shift (dop1,
                            gen_lowpart (half_mode, d->op1),
                            GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      /* vpack* works per 128-bit lane; reorder the quadwords
         { 0, 2, 1, 3 } to undo the interleaving.  */
      emit_insn (gen_avx2_permv4di_1 (t,
                                      gen_lowpart (V4DImode, op),
                                      const0_rtx,
                                      const2_rtx,
                                      const1_rtx,
                                      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
   24465              : 
   24466              : /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   24467              :    and extract-odd permutations of two V64QI operands
   24468              :    with two "shifts", two "truncs" and one "concat" insns for "odd"
   24469              :    and two "truncs" and one concat insn for "even."
   24470              :    Have already failed all two instruction sequences.  */
   24471              : 
   24472              : static bool
   24473        23672 : expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
   24474              : {
   24475        23672 :   rtx t1, t2, t3, t4;
   24476        23672 :   unsigned i, odd, nelt = d->nelt;
   24477              : 
   24478        23672 :   if (!TARGET_AVX512BW
   24479          106 :       || d->one_operand_p
   24480           70 :       || d->vmode != V64QImode)
   24481              :     return false;
   24482              : 
   24483              :   /* Check that permutation is even or odd.  */
   24484           70 :   odd = d->perm[0];
   24485           70 :   if (odd > 1)
   24486              :     return false;
   24487              : 
   24488         2422 :   for (i = 1; i < nelt; ++i)
   24489         2388 :     if (d->perm[i] != 2 * i + odd)
   24490              :       return false;
   24491              : 
   24492           34 :   if (d->testing_p)
   24493              :     return true;
   24494              : 
   24495              : 
   24496           34 :   if (odd)
   24497              :     {
   24498            5 :       t1 = gen_reg_rtx (V32HImode);
   24499            5 :       t2 = gen_reg_rtx (V32HImode);
   24500           10 :       emit_insn (gen_lshrv32hi3 (t1,
   24501            5 :                                  gen_lowpart (V32HImode, d->op0),
   24502              :                                  GEN_INT (8)));
   24503           10 :       emit_insn (gen_lshrv32hi3 (t2,
   24504            5 :                                  gen_lowpart (V32HImode, d->op1),
   24505              :                                  GEN_INT (8)));
   24506              :     }
   24507              :   else
   24508              :     {
   24509           29 :       t1 = gen_lowpart (V32HImode, d->op0);
   24510           29 :       t2 = gen_lowpart (V32HImode, d->op1);
   24511              :     }
   24512              : 
   24513           34 :   t3 = gen_reg_rtx (V32QImode);
   24514           34 :   t4 = gen_reg_rtx (V32QImode);
   24515           34 :   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
   24516           34 :   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
   24517           34 :   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
   24518              : 
   24519           34 :   return true;
   24520              : }
   24521              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  ODD is 0 to extract the even-numbered
   elements of the concatenated operands, 1 for the odd ones.  With
   d->testing_p, report feasibility without emitting insns.  Dispatches
   per mode, delegating to the pack/trunc/pshufb helpers where those
   are cheaper.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
        break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
        t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
        t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
        /* shufps immediate: 0xdd selects the odd, 0x88 the even
           positions within each 128-bit lane.  */
        int mask = odd ? 0xdd : 0x88;

        if (d->testing_p)
          break;
        t1 = gen_reg_rtx (V8SFmode);
        t2 = gen_reg_rtx (V8SFmode);
        t3 = gen_reg_rtx (V8SFmode);

        /* Shuffle within the 128-bit lanes to produce:
           { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
        emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
                                      GEN_INT (mask)));

        /* Shuffle the lanes around to produce:
           { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
        emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
                                            GEN_INT (0x3)));

        /* Shuffle within the 128-bit lanes to produce:
           { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
        emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

        /* Shuffle within the 128-bit lanes to produce:
           { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
        emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

        /* Shuffle the lanes around to produce:
           { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
        emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
                                            GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
        return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
      else
        {
          if (d->testing_p)
            break;
          /* We need 2*log2(N)-1 operations to achieve odd/even
             with interleave. */
          t1 = gen_reg_rtx (V4QImode);
          emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
          emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
          if (odd)
            t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
          else
            t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
          emit_insn (t2);
        }
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
        return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
      else
        {
          if (d->testing_p)
            break;
          /* We need 2*log2(N)-1 operations to achieve odd/even
             with interleave. */
          t1 = gen_reg_rtx (V4HImode);
          emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
          emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
          if (odd)
            t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
          else
            t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
          emit_insn (t2);
        }
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
        return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
      else
        {
          if (d->testing_p)
            break;
          /* We need 2*log2(N)-1 operations to achieve odd/even
             with interleave. */
          t1 = gen_reg_rtx (V8HImode);
          t2 = gen_reg_rtx (V8HImode);
          emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
          emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
          emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
          emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
          if (odd)
            t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
          else
            t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
          emit_insn (t3);
        }
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
        {
          /* Without AVX2, reinterpret the operands as V4DF and reuse
             the floating-point lane shuffles above.  */
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V4DFmode;
          if (d->testing_p)
            d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V4DFmode);
          d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
          d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
          if (expand_vec_perm_even_odd_1 (&d_copy, odd))
            {
              if (!d->testing_p)
                emit_move_insn (d->target,
                                gen_lowpart (V4DImode, d_copy.target));
              return true;
            }
          return false;
        }

      if (d->testing_p)
        break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
        t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
        t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
        {
          /* Without AVX2, reinterpret the operands as V8SF and reuse
             the floating-point shuffles above.  */
          struct expand_vec_perm_d d_copy = *d;
          d_copy.vmode = V8SFmode;
          if (d->testing_p)
            d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
          else
            d_copy.target = gen_reg_rtx (V8SFmode);
          d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
          d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
          if (expand_vec_perm_even_odd_1 (&d_copy, odd))
            {
              if (!d->testing_p)
                emit_move_insn (d->target,
                                gen_lowpart (V8SImode, d_copy.target));
              return true;
            }
          return false;
        }

      if (d->testing_p)
        break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
         { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
                                    gen_lowpart (V4DImode, d->op1),
                                    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
                                    gen_lowpart (V4DImode, d->op1),
                                    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
         { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
                                    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
                                    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
         { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
        t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
                                           gen_lowpart (V4DImode, t2));
      else
        t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
                                          gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
   24782              : 
   24783              : /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   24784              :    extract-even and extract-odd permutations.  */
   24785              : 
   24786              : static bool
   24787        23545 : expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
   24788              : {
   24789        23545 :   unsigned i, odd, nelt = d->nelt;
   24790              : 
   24791        23545 :   odd = d->perm[0];
   24792        23545 :   if (odd != 0 && odd != 1)
   24793              :     return false;
   24794              : 
   24795        63645 :   for (i = 1; i < nelt; ++i)
   24796        56143 :     if (d->perm[i] != 2 * i + odd)
   24797              :       return false;
   24798              : 
   24799         7502 :   if (d->vmode == E_V32HImode
   24800           12 :       && d->testing_p
   24801           12 :       && !TARGET_AVX512BW)
   24802              :     return false;
   24803              : 
   24804         7490 :   return expand_vec_perm_even_odd_1 (d, odd);
   24805              : }
   24806              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.
   Returns true if the broadcast of element D->perm[0] was expanded (or,
   when D->testing_p, is expandable); false if this mode must be handled
   elsewhere.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  /* ELT is the element to broadcast; NELT2 tracks the half-vector
     boundary used to choose between low/high interleaves.  */
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
         use the vbroadcast instruction.  They expand to two insns
         if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
    case E_V4HImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
        return true;

      /* Interleave doubles the element to two adjacent HI-sized slots;
         pick the high or low interleave depending on which half ELT
         lives in, and rebase ELT into the resulting half.  */
      if (elt >= nelt2)
        {
          gen = gen_mmx_punpckhbw_low;
          elt -= nelt2;
        }
      else
        gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      /* Finish with a 2-element select that replicates ELT.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
      /* This can be implemented via interleave.  We save one insn by
         stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
        return true;
      do
        {
          /* Each iteration widens the element by self-interleaving,
             halving the number of distinct positions; ELT is rebased
             whenever the high half is selected.  */
          if (elt >= nelt2)
            {
              gen = vmode == V8QImode ? gen_mmx_punpckhbw
                                      : gen_mmx_punpckhwd;
              elt -= nelt2;
            }
          else
            gen = vmode == V8QImode ? gen_mmx_punpcklbw
                                    : gen_mmx_punpcklwd;
          nelt2 /= 2;

          dest = gen_reg_rtx (vmode);
          emit_insn (gen (dest, op0, op0));
          vmode = get_mode_wider_vector (vmode);
          op0 = gen_lowpart (vmode, dest);
        }
      while (vmode != V2SImode);

      /* Replicate the surviving SI element via a 2-element select.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
         stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
        return true;
      do
        {
          /* Same widening loop as the V8QImode case, but with the SSE
             interleave patterns, stopping at V4SImode for pshufd.  */
          if (elt >= nelt2)
            {
              gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
                                       : gen_vec_interleave_highv8hi;
              elt -= nelt2;
            }
          else
            gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
                                     : gen_vec_interleave_lowv8hi;
          nelt2 /= 2;

          dest = gen_reg_rtx (vmode);
          emit_insn (gen (dest, op0, op0));
          vmode = get_mode_wider_vector (vmode);
          op0 = gen_lowpart (vmode, dest);
        }
      while (vmode != V4SImode);

      /* Broadcast the surviving SI element to all four positions.  */
      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
    case E_V8BFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
        return true;

      /* One interleave promotes the HF/BF element to SI width; then a
         4-element select broadcasts it.  */
      rtx (*gen_interleave) (machine_mode, rtx, rtx, rtx);
      if (elt >= nelt2)
        {
          gen_interleave = gen_vec_interleave_high;
          elt -= nelt2;
        }
      else
        gen_interleave = gen_vec_interleave_low;
      nelt2 /= 2;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen_interleave (vmode, dest, op0, op0));

      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
         vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      /* With AVX512BW a first-element broadcast should already have
         been handled by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      /* With AVX512BW any V32HI broadcast is a single insn and should
         not reach here.  */
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
   24984              : 
   24985              : /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   24986              :    broadcast permutations.  */
   24987              : 
   24988              : static bool
   24989        88997 : expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
   24990              : {
   24991        88997 :   unsigned i, elt, nelt = d->nelt;
   24992              : 
   24993        88997 :   if (!d->one_operand_p)
   24994              :     return false;
   24995              : 
   24996         5385 :   elt = d->perm[0];
   24997         8271 :   for (i = 1; i < nelt; ++i)
   24998         8162 :     if (d->perm[i] != elt)
   24999              :       return false;
   25000              : 
   25001          109 :   return expand_vec_perm_broadcast_1 (d);
   25002              : }
   25003              : 
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  The byte
   permutation is decomposed into two V32HImode word permutations (one
   placing the even result bytes, one the odd ones) whose results are
   byte-adjusted with vpshufb masks and combined with vpor.  */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  /* Only applicable to V64QImode with AVX512BW (for vperm[it]2w).  */
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  /* Always expandable when the preconditions above hold.  */
  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  /* rperm[0..63] is the vpshufb mask for ds[0]'s result, rperm[64..127]
     the mask for ds[1]'s result.  */
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  /* Set up two word-granular sub-permutations over the same inputs,
     viewed as V32HImode.  */
  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      /* Word index containing requested byte d->perm[i] goes into the
         even (i even) or odd (i odd) sub-permutation.  */
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
        {
          /* Odd result byte: zero it in mask 0 (constm1 sets bit 7),
             select the right byte of the word in mask 1.  */
          rperm[i] = constm1_rtx;
          rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
        }
      else
        {
          /* Even result byte: select in mask 0, zero in mask 1.  */
          rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
          rperm[i + 64] = constm1_rtx;
        }
    }

  /* Word permutations are always expandable via vperm[it]2w here.  */
  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  /* vpshufb each intermediate result with its byte-select mask.  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  /* The two halves are disjoint (each zeroed the other's bytes), so a
     single vpor combines them.  */
  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
   25075              : 
/* Implement arbitrary permutation of two V32QImode and V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  /* rperm[0]/rperm[2]: same-lane masks for op0/op1;
     rperm[1]/rperm[3]: cross-lane masks for op0/op1.  */
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  /* Initialize all four masks to "zero this byte" (bit 7 set).  */
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      /* E: element index within its 128-bit lane.  XLANE: nonzero byte
         offset when the source lane differs from the destination lane.
         WHICH: mask selector — bit 1 for op1 sources, bit 0 for
         cross-lane moves.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
        rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  /* Emit the cross-lane vpshufb's (masks 1 and 3), if needed.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
        {
          h[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
                                    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X] with vpermq so the cross-lane bytes
     land in their destination lane.  */
  for (i = 0; i < 2; ++i)
   {
     if (h[i] == NULL_RTX)
       continue;
     op = gen_reg_rtx (V4DImode);
     emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
                                     const2_rtx, GEN_INT (3), const0_rtx,
                                     const1_rtx));
     h[i] = gen_lowpart (V32QImode, op);
   }

  /* Emit the same-lane vpshufb's (masks 0 and 2), if needed.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
        {
          l[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  /* Combine each operand's same-lane and cross-lane contributions;
     the masks zeroed complementary bytes, so vpor merges them.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
        {
          op = gen_reg_rtx (V32QImode);
          emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
          l[i] = op;
        }
      else if (h[i])
        l[i] = h[i];
    }

  /* Final vpor merges the op0 and op1 contributions.  */
  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}
   25190              : 
   25191              : /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   25192              :    taken care of, perform the expansion in D and return true on success.  */
   25193              : 
   25194              : static bool
   25195       306628 : ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   25196              : {
   25197              :   /* Try a single instruction expansion.  */
   25198       306628 :   if (expand_vec_perm_1 (d))
   25199              :     return true;
   25200              : 
   25201              :   /* Try sequences of two instructions.  */
   25202              : 
   25203       100982 :   if (expand_vec_perm_pshuflw_pshufhw (d))
   25204              :     return true;
   25205              : 
   25206        98519 :   if (expand_vec_perm_palignr (d, false))
   25207              :     return true;
   25208              : 
   25209        95384 :   if (expand_vec_perm_interleave2 (d))
   25210              :     return true;
   25211              : 
   25212        88997 :   if (expand_vec_perm_broadcast (d))
   25213              :     return true;
   25214              : 
   25215        88896 :   if (expand_vec_perm_vpermq_perm_1 (d))
   25216              :     return true;
   25217              : 
   25218        88896 :   if (expand_vec_perm_vperm2f128 (d))
   25219              :     return true;
   25220              : 
   25221        88832 :   if (expand_vec_perm_pblendv (d))
   25222              :     return true;
   25223              : 
   25224        87076 :   if (expand_vec_perm_2perm_interleave (d, true))
   25225              :     return true;
   25226              : 
   25227        86714 :   if (expand_vec_perm_2perm_pblendv (d, true))
   25228              :     return true;
   25229              : 
   25230        83619 :   if (expand_vec_perm_shufps_shufps (d))
   25231              :     return true;
   25232              : 
   25233        48857 :   if (expand_vec_perm_punpckldq_pshuf (d))
   25234              :     return true;
   25235              : 
   25236              :   /* Try sequences of three instructions.  */
   25237              : 
   25238        43583 :   if (expand_vec_perm_even_odd_pack (d))
   25239              :     return true;
   25240              : 
   25241        29997 :   if (expand_vec_perm_2vperm2f128_vshuf (d))
   25242              :     return true;
   25243              : 
   25244        28720 :   if (expand_vec_perm_pshufb2 (d))
   25245              :     return true;
   25246              : 
   25247        27669 :   if (expand_vec_perm_pslldq_psrldq_por (d, false))
   25248              :     return true;
   25249              : 
   25250        27426 :   if (expand_vec_perm_interleave3 (d))
   25251              :     return true;
   25252              : 
   25253        27288 :   if (expand_vec_perm_vperm2f128_vblend (d))
   25254              :     return true;
   25255              : 
   25256        27288 :   if (expand_vec_perm_2perm_interleave (d, false))
   25257              :     return true;
   25258              : 
   25259        27048 :   if (expand_vec_perm_2perm_pblendv (d, false))
   25260              :     return true;
   25261              : 
   25262        26148 :   if (expand_vec_perm_psrlw_psllw_por (d))
   25263              :     return true;
   25264              : 
   25265        24710 :   if (expand_vec_perm_pand_pandn_por (d))
   25266              :     return true;
   25267              : 
   25268              :   /* Try sequences of four instructions.  */
   25269              : 
   25270        23636 :   if (expand_vec_perm_even_odd_trunc (d))
   25271              :     return true;
   25272        23624 :   if (expand_vec_perm_vpshufb2_vpermq (d))
   25273              :     return true;
   25274              : 
   25275        23617 :   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
   25276              :     return true;
   25277              : 
   25278        23617 :   if (expand_vec_perm_vpermt2_vpshub2 (d))
   25279              :     return true;
   25280              : 
   25281              :   /* ??? Look for narrow permutations whose element orderings would
   25282              :      allow the promotion to a wider mode.  */
   25283              : 
   25284              :   /* ??? Look for sequences of interleave or a wider permute that place
   25285              :      the data into the correct lanes for a half-vector shuffle like
   25286              :      pshuf[lh]w or vpermilps.  */
   25287              : 
   25288              :   /* ??? Look for sequences of interleave that produce the desired results.
   25289              :      The combinatorics of punpck[lh] get pretty ugly... */
   25290              : 
   25291        23545 :   if (expand_vec_perm_even_odd (d))
   25292              :     return true;
   25293              : 
   25294              :   /* Generate four or five instructions.  */
   25295        16129 :   if (expand_vec_perm_pslldq_psrldq_por (d, true))
   25296              :     return true;
   25297              : 
   25298              :   /* Even longer sequences.  */
   25299        15844 :   if (expand_vec_perm_vpshufb4_vpermq2 (d))
   25300              :     return true;
   25301              : 
   25302              :   /* See if we can get the same permutation in different vector integer
   25303              :      mode.  */
   25304        15790 :   struct expand_vec_perm_d nd;
   25305        15790 :   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
   25306              :     {
   25307            0 :       if (!d->testing_p)
   25308            0 :         emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
   25309            0 :       return true;
   25310              :     }
   25311              : 
   25312              :   /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
   25313        15790 :   if (expand_vec_perm2_vperm2f128_vblend (d))
   25314              :     return true;
   25315              : 
   25316              :   return false;
   25317              : }
   25318              : 
   25319              : /* If a permutation only uses one operand, make it clear. Returns true
   25320              :    if the permutation references both operands.  */
   25321              : 
   25322              : static bool
   25323        74796 : canonicalize_perm (struct expand_vec_perm_d *d)
   25324              : {
   25325        74796 :   int i, which, nelt = d->nelt;
   25326              : 
   25327       450922 :   for (i = which = 0; i < nelt; ++i)
   25328       511191 :     which |= (d->perm[i] < nelt ? 1 : 2);
   25329              : 
   25330        74796 :   d->one_operand_p = true;
   25331        74796 :   switch (which)
   25332              :     {
   25333            0 :     default:
   25334            0 :       gcc_unreachable();
   25335              : 
   25336        55750 :     case 3:
   25337        55750 :       if (!rtx_equal_p (d->op0, d->op1))
   25338              :         {
   25339        55699 :           d->one_operand_p = false;
   25340        55699 :           break;
   25341              :         }
   25342              :       /* The elements of PERM do not suggest that only the first operand
   25343              :          is used, but both operands are identical.  Allow easier matching
   25344              :          of the permutation by folding the permutation into the single
   25345              :          input vector.  */
   25346              :       /* FALLTHRU */
   25347              : 
   25348              :     case 2:
   25349         2913 :       for (i = 0; i < nelt; ++i)
   25350         2576 :         d->perm[i] &= nelt - 1;
   25351          337 :       d->op0 = d->op1;
   25352          337 :       break;
   25353              : 
   25354        18760 :     case 1:
   25355        18760 :       d->op1 = d->op0;
   25356        18760 :       break;
   25357              :     }
   25358              : 
   25359        74796 :   return (which == 3);
   25360              : }
   25361              : 
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.

   Decide whether the constant permutation SEL of two OP_MODE vectors
   OP0/OP1 can be expanded for the target and, if TARGET is non-null,
   emit the code producing the VMODE result into TARGET.  A null TARGET
   requests a dry run ("can this be done?") in which no insns may
   survive.  Returns true on success.  */

bool
ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
			       rtx target, rtx op0, rtx op1,
			       const vec_perm_indices &sel)
{
  /* Mode-changing permutations are not supported here.  */
  if (vmode != op_mode)
    return false;

  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF and BF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
			       GET_MODE_NUNITS (vmode)).require ();
      /* In the dry run (null TARGET) the operands may be null too.  */
      if (target)
	target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
	op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
	op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  /* Null TARGET means test-only mode.  */
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
	return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
	return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
	return true;
      break;
    case E_V4QImode:
      if (!TARGET_SSE2)
	return false;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Copy the selector, recording in WHICH which inputs are referenced:
     bit 0 set if any index selects from OP0, bit 1 if any selects
     from OP1.  PERM keeps an unmodified copy for the retry below.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
	      || d.vmode == V4SImode || d.vmode == V2SImode
	      || d.vmode == V4HImode || d.vmode == V2HImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      /* Expand into a scratch sequence so nothing emitted during the
	 dry run leaks into the real insn stream.  */
      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
	{
	  /* Swap operands so the zero vector is second, and flip the
	     selector bits (^= nelt) to match the swapped order.  */
	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
	  std::swap (dzero.op0, dzero.op1);
	  for (i = 0; i < nelt; ++i)
	    dzero.perm[i] ^= nelt;
	}
      else
	d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
				  dzero.perm, nelt, dzero.testing_p))
	return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
   25585              : 
   25586              : void
   25587         8214 : ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   25588              : {
   25589         8214 :   struct expand_vec_perm_d d;
   25590         8214 :   unsigned i, nelt;
   25591              : 
   25592         8214 :   d.target = targ;
   25593         8214 :   d.op0 = op0;
   25594         8214 :   d.op1 = op1;
   25595         8214 :   d.vmode = GET_MODE (targ);
   25596         8214 :   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   25597         8214 :   d.one_operand_p = false;
   25598         8214 :   d.testing_p = false;
   25599              : 
   25600        78090 :   for (i = 0; i < nelt; ++i)
   25601        69876 :     d.perm[i] = i * 2 + odd;
   25602              : 
   25603              :   /* We'll either be able to implement the permutation directly...  */
   25604         8214 :   if (expand_vec_perm_1 (&d))
   25605         3185 :     return;
   25606              : 
   25607              :   /* ... or we use the special-case patterns.  */
   25608         5029 :   expand_vec_perm_even_odd_1 (&d, odd);
   25609              : }
   25610              : 
   25611              : static void
   25612          924 : ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   25613              : {
   25614          924 :   struct expand_vec_perm_d d;
   25615          924 :   unsigned i, nelt, base;
   25616          924 :   bool ok;
   25617              : 
   25618          924 :   d.target = targ;
   25619          924 :   d.op0 = op0;
   25620          924 :   d.op1 = op1;
   25621          924 :   d.vmode = GET_MODE (targ);
   25622          924 :   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   25623          924 :   d.one_operand_p = false;
   25624          924 :   d.testing_p = false;
   25625              : 
   25626          924 :   base = high_p ? nelt / 2 : 0;
   25627         3652 :   for (i = 0; i < nelt / 2; ++i)
   25628              :     {
   25629         2728 :       d.perm[i * 2] = i + base;
   25630         2728 :       d.perm[i * 2 + 1] = i + base + nelt;
   25631              :     }
   25632              : 
   25633              :   /* Note that for AVX this isn't one instruction.  */
   25634          924 :   ok = ix86_expand_vec_perm_const_1 (&d);
   25635          924 :   gcc_assert (ok);
   25636          924 : }
   25637              : 
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.

   CODE is ASHIFT, ASHIFTRT or LSHIFTRT; DEST and OP1 are V16QI, V32QI
   or V64QI vectors and OP2 is the shift count.  The byte vector is
   reinterpreted as a word vector, shifted as words, and the bits that
   leaked across byte boundaries are masked off; arithmetic right
   shifts additionally restore the sign via an xor/sub pair.  */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when shift amount greater equal 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);


  /* Arithmetic shift right by 7 broadcasts the sign bit: each result
     byte is 0 or -1, which is exactly a 0 > x byte comparison (or, for
     V64QI, a cvtb2mask/cvtmask2b round-trip through a mask register).  */
  if (shift_amount == 7
      && code == ASHIFTRT)
    {
      if (qimode == V16QImode
	  || qimode == V32QImode)
	{
	  rtx zero = gen_reg_rtx (qimode);
	  emit_move_insn (zero, CONST0_RTX (qimode));
	  emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
	}
      else
	{
	  gcc_assert (qimode == V64QImode);
	  rtx kmask = gen_reg_rtx (DImode);
	  emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
	  emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
	}
      return true;
    }

  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  /* Pick the word-mode shift and the byte-mode and/xor/sub generators
     matching the vector width.  */
  switch (qimode)
    {
    case V16QImode:
      himode = V8HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv8hi3
	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case V32QImode:
      himode = V16HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv16hi3
	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case V64QImode:
      himode = V32HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv32hi3
	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest
     which sign-extends the logically-shifted bytes: xoring with the
     shifted sign-bit position then subtracting it propagates the sign
     into the cleared high bits.  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }
  return true;
}
   25759              : 
/* Expand a vector operation CODE (MULT or a shift) for a partial
   V4QImode/V8QImode DEST in terms of the same operation on V8HImode.
   OP2 may be a vector shift-count/multiplier or a scalar shift count.
   The inputs are widened into a full V16QImode register, the operation
   is performed on words, and the low bytes of the word result are
   packed back into DEST.  */

void
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  rtx qop1, qop2, hop1, hop2, qdest, hdest;
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  /* Only arithmetic right shift needs sign-extending unpacks.  */
  bool uns_p = code != ASHIFTRT;

  /* Only the partial QI vector modes are handled here.  */
  switch (qimode)
    {
    case E_V4QImode:
    case E_V8QImode:
      break;
    default:
      gcc_unreachable ();
    }

  /* View the partial vectors as the low part of full V16QI registers.  */
  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  qdest = gen_reg_rtx (V16QImode);

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
	 Even with SSE4.1 the alternative is better.  */
      && !TARGET_SSE4_1
      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
    {
      emit_move_insn (dest, gen_lowpart (qimode, qdest));
      return;
    }

  /* Arithmetic shift right by 7 is just a sign-bit broadcast;
     emit it as the byte comparison 0 > op1.  */
  if (CONST_INT_P (op2)
      && code == ASHIFTRT
      && INTVAL (op2) == 7)
    {
      rtx zero = gen_reg_rtx (qimode);
      emit_move_insn (zero, CONST0_RTX (qimode));
      emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
      return;
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      if (!TARGET_SSE4_1)
	{
	  /* Unpack data such that we've got a source byte in each low byte
	     of each word.  We don't care what goes into the high byte of
	     each word.  Rather than trying to get zero in there, most
	     convenient is to let it be a copy of the low byte.  */
	  hop1 = copy_to_reg (qop1);
	  hop2 = copy_to_reg (qop2);
	  emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
	  emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
	  break;
	}
      /* FALLTHRU */
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      hop1 = gen_reg_rtx (V8HImode);
      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
      /* mult/vashr/vlshr/vashl  */
      if (op2vec)
	{
	  hop2 = gen_reg_rtx (V8HImode);
	  ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
	}
      else
	hop2 = qop2;

      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (V8HImode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, V8HImode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
				NULL_RTX, 1, OPTAB_DIRECT);

  if (TARGET_AVX512BW && TARGET_AVX512VL)
    {
      /* vpmovwb truncates the word result straight back to bytes.  */
      if (qimode == V8QImode)
	qdest = dest;
      else
	qdest = gen_reg_rtx (V8QImode);

      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
    }
  else
    {
      struct expand_vec_perm_d d;
      rtx qres = gen_lowpart (V16QImode, hdest);
      bool ok;
      int i;

      /* Merge the data back into the right place.  */
      d.target = qdest;
      d.op0 = d.op1 = qres;
      d.vmode = V16QImode;
      d.nelt = 16;
      d.one_operand_p = TARGET_SSSE3;
      d.testing_p = false;

      /* Select the even bytes, i.e. the low byte of each word.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;

      ok = ix86_expand_vec_perm_const_1 (&d);
      gcc_assert (ok);
    }

  if (qdest != dest)
    emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
   25890              : 
/* Emit instruction in 2x wider mode.  For example, optimize
   vector MUL generation like

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw   ymm4, ymm2, ymm3
   vpmovwb   xmm0, ymm4

   it would take less instructions than ix86_expand_vecop_qihi.
   Return true if success.

   CODE is the byte-vector operation, DEST/OP1 the V16QI or V32QI
   destination and first operand, and OP2 either a matching vector or
   a scalar shift count.  Requires AVX512BW for vpmovwb (plus AVX512VL
   for the 128-bit case); returns false when the needed ISA is absent
   so the caller can fall back.  */

static bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  machine_mode wqimode;
  rtx qop1, qop2, hop1, hop2, hdest;
  rtx (*gen_truncate)(rtx, rtx) = NULL;
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  /* Only arithmetic right shift needs sign-extending unpacks.  */
  bool uns_p = code != ASHIFTRT;

  /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
     generic permutation to merge the data back into the right place.  This
     permutation results in VPERMQ, which is slow, so better fall back to
     ix86_expand_vecop_qihi.  */
  if (!TARGET_AVX512BW
      || (qimode == V16QImode && !TARGET_AVX512VL)
      /* There are no V64HImode instructions.  */
      || qimode == V64QImode)
     return false;

  /* Do not generate ymm/zmm instructions when
     target prefers 128/256 bit vector width.  */
  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
      || (qimode == V32QImode && TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V16HImode;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* View the operands as the low half of a double-width QI vector so
     the unpack can widen every element into HIMODE.  */
  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  hop1 = gen_reg_rtx (himode);
  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);

  if (op2vec)
    {
      hop2 = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
    }
  else
    hop2 = qop2;

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, himode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (himode, code, hop1, hop2,
				 NULL_RTX, 1, OPTAB_DIRECT);

  /* Narrow the word result back to bytes (vpmovwb).  */
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
   25977              : 
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  DEST receives the V*QImode result;
   OP1 and OP2 are the QImode-vector operands (OP2 may instead be a
   CONST_INT or scalar shift count for the shift codes).  Supported
   CODEs are MULT and the three shifts ASHIFT/ASHIFTRT/LSHIFTRT.
   The strategy: widen the operands to HImode vectors (low and high
   halves separately), perform the operation there, then permute the
   byte results back into place.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  /* Interleave-low/high insn generators, selected per mode below.  */
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  /* True when OP2 is itself an integer vector (per-element shift
     counts or a MULT operand) rather than a scalar count.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  struct expand_vec_perm_d d;
  bool full_interleave = true;
  /* Only arithmetic right shift needs sign extension on unpack.  */
  bool uns_p = code != ASHIFTRT;
  bool ok;
  int i;

  /* Shifts by a constant count can sometimes be done directly in
     QImode with masking; try that first.  */
  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  /* Try the variant that widens to a single double-width vector
     (e.g. with AVX512BW) before falling back to the split scheme.  */
  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  /* Pick the HImode vector mode with the same number of bits.  */
  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      break;
    case E_V32QImode:
      himode = V16HImode;
      break;
    case E_V64QImode:
      himode = V32HImode;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      switch (qimode)
	{
	case E_V16QImode:
	  gen_il = gen_vec_interleave_lowv16qi;
	  gen_ih = gen_vec_interleave_highv16qi;
	  break;
	case E_V32QImode:
	  /* AVX2/AVX512BW interleaves operate within 128-bit lanes,
	     so the final permutation differs (full_interleave = false).  */
	  gen_il = gen_avx2_interleave_lowv32qi;
	  gen_ih = gen_avx2_interleave_highv32qi;
	  full_interleave = false;
	  break;
	case E_V64QImode:
	  gen_il = gen_avx512bw_interleave_lowv64qi;
	  gen_ih = gen_avx512bw_interleave_highv64qi;
	  full_interleave = false;
	  break;
	default:
	  gcc_unreachable ();
	}

      /* Duplicate each byte of OP2 into both bytes of a word.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      /* Likewise for OP1.  */
      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      break;

    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      /* For shifts, properly extend OP1's bytes to words (signed for
	 ASHIFTRT, unsigned otherwise).  */
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (op2vec)
	{
	  /* Per-element counts: unpack them the same way as OP1.  */
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	/* Scalar count: shared by both halves as-is.  */
	op2_l = op2_h = op2;

      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  else
    {
      /* Expand mult/ashr/lshr/ashl.  */
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* We used the full interleave, the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  /* Emit the permutation; it must be representable for these modes.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
   26141              : 
   26142              : /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   26143              :    if op is CONST_VECTOR with all odd elements equal to their
   26144              :    preceding element.  */
   26145              : 
   26146              : static bool
   26147         8772 : const_vector_equal_evenodd_p (rtx op)
   26148              : {
   26149         8772 :   machine_mode mode = GET_MODE (op);
   26150         8772 :   int i, nunits = GET_MODE_NUNITS (mode);
   26151         8772 :   if (!CONST_VECTOR_P (op)
   26152         8772 :       || nunits != CONST_VECTOR_NUNITS (op))
   26153              :     return false;
   26154         3579 :   for (i = 0; i < nunits; i += 2)
   26155         2886 :     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
   26156              :       return false;
   26157              :   return true;
   26158              : }
   26159              : 
/* Emit a widening multiply into DEST: multiply the even (or, when
   ODD_P, the odd) SImode elements of OP1 and OP2, producing DImode
   products.  UNS_P selects unsigned vs. signed multiplication.
   OP1/OP2 are V4SI/V8SI/V16SI vectors; DEST has the corresponding
   double-width mode.  */

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  /* Keep the unforced operands around so constant-vector checks below
     can look at the original CONST_VECTORs.  */
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      /* Shift each operand right by one element width, unless it is a
	 constant vector whose odd elements already duplicate the even
	 ones (then the shift would be a no-op for our purposes).  */
      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  /* Pick the even-multiply insn matching the mode and signedness.  */
  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      /* Signed V4SI without SSE4.1's PMULDQ: synthesize the signed
	 product from unsigned PMULUDQ plus sign corrections.  */
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
   26254              : 
/* Emit a widening multiply into DEST: multiply the low (or, when
   HIGH_P, the high) half of the elements of OP1 and OP2, producing
   double-width results.  UNS_P selects unsigned vs. signed multiply.
   Dispatches on the element mode of OP1/OP2.  */

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  /* Interleaving moved the wanted half into even/odd pairs,
	     so the even-element multiply below gets the right data.  */
	  high_p = false;
	}
      /* high_p doubles as odd_p for the even/odd multiply here.  */
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      /* HImode has native low and high-part multiplies; compute both
	 and interleave the desired halves together.  */
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V64QImode:
      /* Widen both operands first, then multiply in the wide mode.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
   26344              : 
   26345              : void
   26346         3661 : ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
   26347              : {
   26348         3661 :   rtx res_1, res_2, res_3, res_4;
   26349              : 
   26350         3661 :   res_1 = gen_reg_rtx (V4SImode);
   26351         3661 :   res_2 = gen_reg_rtx (V4SImode);
   26352         3661 :   res_3 = gen_reg_rtx (V2DImode);
   26353         3661 :   res_4 = gen_reg_rtx (V2DImode);
   26354         3661 :   ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
   26355         3661 :   ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
   26356              : 
   26357              :   /* Move the results in element 2 down to element 1; we don't care
   26358              :      what goes in elements 2 and 3.  Then we can merge the parts
   26359              :      back together with an interleave.
   26360              : 
   26361              :      Note that two other sequences were tried:
   26362              :      (1) Use interleaves at the start instead of psrldq, which allows
   26363              :      us to use a single shufps to merge things back at the end.
   26364              :      (2) Use shufps here to combine the two vectors, then pshufd to
   26365              :      put the elements in the correct order.
   26366              :      In both cases the cost of the reformatting stall was too high
   26367              :      and the overall sequence slower.  */
   26368              : 
   26369         3661 :   emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
   26370              :                                 const0_rtx, const2_rtx,
   26371              :                                 const0_rtx, const0_rtx));
   26372         3661 :   emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
   26373              :                                 const0_rtx, const2_rtx,
   26374              :                                 const0_rtx, const0_rtx));
   26375         3661 :   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
   26376              : 
   26377         3661 :   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
   26378         3661 : }
   26379              : 
/* Expand OP0 = OP1 * OP2 for a DImode vector mode (V2DI/V4DI/V8DI)
   without a native full 64x64 multiply: use AVX512DQ's VPMULLQ when
   available, an XOP sequence for V2DI, or otherwise synthesize the
   product from 32x32->64 even-element multiplies of the high and low
   halves.  */

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
					gen_lowpart (V4SImode, op1),
					gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      /* Generic schoolbook decomposition:
	 a*b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32),
	 where lo/hi are the 32-bit halves of each 64-bit element.  */
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();


      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  /* Annotate the final insn so the optimizers see a plain MULT.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
   26476              : 
/* Return true if control transfer instruction INSN
   should be encoded with the notrack prefix.  Only relevant when CET
   indirect-branch tracking (-fcf-protection=branch) is enabled.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  /* Without branch protection the prefix is never wanted.  */
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && SYMBOL_REF_P (XEXP (addr, 0)))
	return false;
      else
	/* Indirect call: use notrack only when the call was marked
	   with the nocf_check attribute (REG_CALL_NOCF_CHECK note).  */
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  /* Jump-table dispatch jumps get notrack unless -mcet-switch asked
     for tracked (endbr-based) switch expansion.  */
  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
   26516              : 
   26517              : /* Calculate integer abs() using only SSE2 instructions.  */
   26518              : 
   26519              : void
   26520          629 : ix86_expand_sse2_abs (rtx target, rtx input)
   26521              : {
   26522          629 :   machine_mode mode = GET_MODE (target);
   26523          629 :   rtx tmp0, tmp1, x;
   26524              : 
   26525          629 :   switch (mode)
   26526              :     {
   26527           33 :     case E_V2DImode:
   26528           33 :     case E_V4DImode:
   26529              :       /* For 64-bit signed integer X, with SSE4.2 use
   26530              :          pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
   26531              :          Otherwise handle it similarly to V4SImode, except use 64 as W instead of
   26532              :          32 and use logical instead of arithmetic right shift (which is
   26533              :          unimplemented) and subtract.  */
   26534           33 :       if (TARGET_SSE4_2)
   26535              :         {
   26536            9 :           tmp0 = gen_reg_rtx (mode);
   26537            9 :           tmp1 = gen_reg_rtx (mode);
   26538            9 :           emit_move_insn (tmp1, CONST0_RTX (mode));
   26539            9 :           if (mode == E_V2DImode)
   26540            6 :             emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
   26541              :           else
   26542            3 :             emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
   26543              :         }
   26544              :       else
   26545              :         {
   26546           48 :           tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
   26547           24 :                                       GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
   26548              :                                                - 1), NULL, 0, OPTAB_DIRECT);
   26549           24 :           tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
   26550              :         }
   26551              : 
   26552           33 :       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
   26553              :                                   NULL, 0, OPTAB_DIRECT);
   26554           33 :       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
   26555              :                                target, 0, OPTAB_DIRECT);
   26556           33 :       break;
   26557              : 
   26558           61 :     case E_V4SImode:
   26559              :       /* For 32-bit signed integer X, the best way to calculate the absolute
   26560              :          value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
   26561           61 :       tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
   26562           61 :                                   GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
   26563              :                                   NULL, 0, OPTAB_DIRECT);
   26564           61 :       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
   26565              :                                   NULL, 0, OPTAB_DIRECT);
   26566           61 :       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
   26567              :                                target, 0, OPTAB_DIRECT);
   26568           61 :       break;
   26569              : 
   26570           91 :     case E_V8HImode:
   26571              :       /* For 16-bit signed integer X, the best way to calculate the absolute
   26572              :          value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
   26573           91 :       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
   26574              : 
   26575           91 :       x = expand_simple_binop (mode, SMAX, tmp0, input,
   26576              :                                target, 0, OPTAB_DIRECT);
   26577           91 :       break;
   26578              : 
   26579          444 :     case E_V16QImode:
   26580              :       /* For 8-bit signed integer X, the best way to calculate the absolute
   26581              :          value of X is min ((unsigned char) X, (unsigned char) (-X)),
   26582              :          as SSE2 provides the PMINUB insn.  */
   26583          444 :       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
   26584              : 
   26585          444 :       x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
   26586              :                                target, 0, OPTAB_DIRECT);
   26587          444 :       break;
   26588              : 
   26589            0 :     default:
   26590            0 :       gcc_unreachable ();
   26591              :     }
   26592              : 
   26593          629 :   if (x != target)
   26594            0 :     emit_move_insn (target, x);
   26595          629 : }
   26596              : 
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.

   operands[0] is the destination, operands[1] the vector source,
   operands[2] the extracted field's size in bits and operands[3]
   its bit position within the source.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
        return false;
      dst = SUBREG_REG (dst);
    }

  /* A subreg of the source just shifts the extraction position;
     fold it into POS and extract from the inner reg.  */
  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx d, pat;

        /* DSTMODE is the integer mode of the extracted element.  */
        if (!int_mode_for_size (size, 0).exists (&dstmode))
          return false;

        /* Map the element mode to the vector mode the pextr insn
           operates on, checking the required ISA level (pextrw is
           SSE2; pextrb/pextrd/pextrq need SSE4.1).  */
        switch (dstmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V16QImode;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            srcmode = V8HImode;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V4SImode;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V2DImode;
            break;

          default:
            return false;
          }

        /* Reject extractions from misaligned positions.  */
        if (pos & (size-1))
          return false;

        /* Extract into DST directly when modes agree, otherwise go
           through a temporary and lowpart-move at the end.  */
        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* Construct insn pattern.  */
        pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
        pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

        /* Let the rtl optimizers know about the zero extension performed.  */
        if (dstmode == QImode || dstmode == HImode)
          {
            pat = gen_rtx_ZERO_EXTEND (SImode, pat);
            d = gen_lowpart (SImode, d);
          }

        emit_insn (gen_rtx_SET (d, pat));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
   26699              : 
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.

   operands[0] is the destination vector, operands[1] the inserted
   field's size in bits, operands[2] its bit position and operands[3]
   the value being inserted.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  /* A subreg of the destination just shifts the insertion position;
     fold it into POS and insert into the inner reg.  */
  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx (*pinsr)(rtx, rtx, rtx, rtx);
        rtx d;

        /* SRCMODE is the integer mode of the inserted element.  */
        if (!int_mode_for_size (size, 0).exists (&srcmode))
          return false;

        /* Map the element mode to the vector mode and pinsr expander,
           checking the required ISA level (pinsrw is SSE2;
           pinsrb/pinsrd/pinsrq need SSE4.1).  */
        switch (srcmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V16QImode;
            pinsr = gen_sse4_1_pinsrb;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            dstmode = V8HImode;
            pinsr = gen_sse2_pinsrw;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V4SImode;
            pinsr = gen_sse4_1_pinsrd;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V2DImode;
            pinsr = gen_sse4_1_pinsrq;
            break;

          default:
            return false;
          }

        /* Reject insertions to misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (SUBREG_P (src))
          {
            unsigned int srcpos = SUBREG_BYTE (src);

            if (srcpos > 0)
              {
                /* Non-lowpart source subreg: materialize the wanted
                   bits with a pextr first, then insert those.  */
                rtx extr_ops[4];

                extr_ops[0] = gen_reg_rtx (srcmode);
                extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
                extr_ops[2] = GEN_INT (size);
                extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

                if (!ix86_expand_pextr (extr_ops))
                  return false;

                src = extr_ops[0];
              }
            else
              src = gen_lowpart (srcmode, SUBREG_REG (src));
          }

        /* Insert into DST directly when modes agree, otherwise go
           through a temporary and lowpart-move at the end.  */
        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* The lane selector is encoded as a one-hot mask
           (1 << lane), as the pinsr expanders expect.  */
        emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
                          gen_lowpart (srcmode, src),
                          GEN_INT (1 << (pos / size))));
        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
   26811              : 
   26812              : /* All CPUs prefer to avoid cross-lane operations so perform reductions
   26813              :    upper against lower halves up to SSE reg size.  */
   26814              : 
   26815              : machine_mode
   26816         1894 : ix86_split_reduction (machine_mode mode)
   26817              : {
   26818              :   /* Reduce lowpart against highpart until we reach SSE reg width to
   26819              :      avoid cross-lane operations.  */
   26820         1894 :   switch (mode)
   26821              :     {
   26822              :     case E_V8DImode:
   26823              :     case E_V4DImode:
   26824              :       return V2DImode;
   26825            9 :     case E_V16SImode:
   26826            9 :     case E_V8SImode:
   26827            9 :       return V4SImode;
   26828            8 :     case E_V32HImode:
   26829            8 :     case E_V16HImode:
   26830            8 :       return V8HImode;
   26831            4 :     case E_V64QImode:
   26832            4 :     case E_V32QImode:
   26833            4 :       return V16QImode;
   26834            5 :     case E_V16SFmode:
   26835            5 :     case E_V8SFmode:
   26836            5 :       return V4SFmode;
   26837           16 :     case E_V8DFmode:
   26838           16 :     case E_V4DFmode:
   26839           16 :       return V2DFmode;
   26840         1847 :     default:
   26841         1847 :       return mode;
   26842              :     }
   26843              : }
   26844              : 
   26845              : /* Generate call to __divmoddi4.  */
   26846              : 
   26847              : void
   26848          896 : ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
   26849              :                             rtx op0, rtx op1,
   26850              :                             rtx *quot_p, rtx *rem_p)
   26851              : {
   26852         1792 :   rtx rem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   26853              : 
   26854          896 :   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
   26855              :                                       mode, op0, mode, op1, mode,
   26856          896 :                                       XEXP (rem, 0), Pmode);
   26857          896 :   *quot_p = quot;
   26858          896 :   *rem_p = rem;
   26859          896 : }
   26860              : 
/* Expand an atomic fetch-op/op-fetch as a compare-and-swap loop:
   atomically apply CODE with VAL to MEM, storing into TARGET the old
   memory value when !AFTER (fetch_op) or the new value when AFTER
   (op_fetch).  DOUBLEWORD selects the double-word cmpxchg variant.  */

void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
                                  enum rtx_code code, bool after,
                                  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  /* OLD_MEM caches the last observed memory value; the cmpxchg loop
     below refreshes it on failure before branching back here.  */
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  /* NOT encodes a NAND-style update: new = ~(old & val).  All other
     codes expand directly as new = old CODE val.  */
  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
                                     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
                                   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  /* Emit the cmpxchg; on failure it loops back to LOOP_LABEL.  */
  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
                            gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
                                          SImode),
                            doubleword, loop_label);
}
   26902              : 
/* Relax cmpxchg instruction, param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and skip
   cmpxchg instruction if mem != exp_input.

   *PTARGET_BOOL (allocated here if NULL) receives the success flag,
   TARGET_VAL the value observed in MEM.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
                          rtx mem, rtx exp_input, rtx new_input,
                          rtx mem_model, bool doubleword,
                          rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  /* Pick the cmpxchg expander for MODE; doubleword variants compare
     the value in two HMODE halves below.  */
  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
        {
          gendw = gen_atomic_compare_and_swapdi_doubleword;
          hmode = SImode;
        }
      else
        gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
                             GET_MODE (exp_input), 1, cmp_label,
                             profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
                      gen_lowpart (hmode, new_input),
                      gen_highpart (hmode, new_input),
                      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
  {
    /* No retry loop requested: on mismatch just hand back the loaded
       value, and derive the success flag from ZF in either case.  */
    emit_jump_insn (gen_jump (done_label));
    emit_barrier ();
    emit_label (cmp_label);
    emit_move_insn (target_val, new_mem);
    emit_label (done_label);
    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                       const0_rtx);
  }
  else
  {
    /* Retry variant: fall back to LOOP_LABEL while the cmpxchg keeps
       failing (success flag is zero).  */
    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                       const0_rtx);
    emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
                             GET_MODE (target_bool), 1, loop_label,
                             profile_probability::guessed_never ());
    emit_jump_insn (gen_jump (done_label));
    emit_barrier ();

    /* If mem is not expected, pause and loop back.  */
    emit_label (cmp_label);
    emit_move_insn (target_val, new_mem);
    emit_insn (gen_pause ());
    emit_jump_insn (gen_jump (loop_label));
    emit_barrier ();
    emit_label (done_label);
  }

  *ptarget_bool = target_bool;
}
   27020              : 
   27021              : /* Convert a BFmode VAL to SFmode without signaling sNaNs.
   27022              :    This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16.  */
   27023              : 
   27024              : rtx
   27025         2832 : ix86_expand_fast_convert_bf_to_sf (rtx val)
   27026              : {
   27027         2832 :   rtx op = gen_lowpart (HImode, val), ret;
   27028         2832 :   if (CONST_INT_P (op))
   27029              :     {
   27030          514 :       ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
   27031              :                                             val, BFmode);
   27032          514 :       if (ret)
   27033              :         return ret;
   27034              :       /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
   27035            1 :       ret = gen_reg_rtx (SImode);
   27036            1 :       emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
   27037            1 :       emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
   27038            1 :       return gen_lowpart (SFmode, ret);
   27039              :     }
   27040              : 
   27041         2318 :   ret = gen_reg_rtx (SFmode);
   27042         2318 :   emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
   27043         2318 :   return ret;
   27044              : }
   27045              : 
/* Expand the first comparison of an APX conditional-compare (ccmp)
   chain for CODE applied to TREEOP0/TREEOP1.  On success store the
   operand-preparation insns in *PREP_SEQ and the comparison insns in
   *GEN_SEQ, and return the comparison rtx; return NULL_RTX when the
   comparison cannot be handled by ccmp.  */

rtx
ix86_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
                        rtx_code code, tree treeop0, tree treeop1)
{
  if (!TARGET_APX_CCMP)
    return NULL_RTX;

  rtx op0, op1, res;
  machine_mode op_mode;

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  /* OP0 may be a constant with VOIDmode; fall back to OP1's mode.  */
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  /* We only support following scalar comparisons that use just 1
     instruction: DI/SI/QI/HI/DF/SF/HF.
     Unordered/Ordered compare cannot be correctly identified by
     ccmp so they are not supported.  */
  if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
        || op_mode == QImode || op_mode == DFmode || op_mode == SFmode
        || op_mode == HFmode)
      || code == ORDERED
      || code == UNORDERED)
    {
      end_sequence ();
      return NULL_RTX;
    }

  /* Canonicalize the operands according to mode.  */
  if (SCALAR_INT_MODE_P (op_mode))
    {
      if (!nonimmediate_operand (op0, op_mode))
        op0 = force_reg (op_mode, op0);
      if (!x86_64_general_operand (op1, op_mode))
        op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* op0/op1 can be canonicalized from expand_fp_compare, so
         just adjust the code to make it generate supported fp
         condition.  */
      if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
        {
          /* First try to split condition if we don't need to honor
             NaNs, as the ORDERED/UNORDERED check always fall
             through.  */
          if (!HONOR_NANS (op_mode))
            {
              rtx_code first_code;
              split_comparison (code, op_mode, &first_code, &code);
            }
          /* Otherwise try to swap the operand order and check if
             the comparison is supported.  */
          else
            {
              code = swap_condition (code);
              std::swap (op0, op1);
            }

          /* Give up if the adjusted code is still unsupported.  */
          if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
            {
              end_sequence ();
              return NULL_RTX;
            }
        }
    }

  *prep_seq = end_sequence ();

  start_sequence ();

  res = ix86_expand_compare (code, op0, op1);

  if (!res)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = end_sequence ();

  return res;
}
   27131              : 
/* Expand a subsequent comparison of an APX ccmp chain: combine PREV
   (the flags comparison built so far) with CMP_CODE applied to
   TREEOP0/TREEOP1 under BIT_CODE (AND/IOR).  Insns are appended to
   *PREP_SEQ and *GEN_SEQ.  Return the combined flags comparison rtx,
   or NULL_RTX when ccmp cannot handle it.  */

rtx
ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
                       rtx_code cmp_code, tree treeop0, tree treeop1,
                       rtx_code bit_code)
{
  if (!TARGET_APX_CCMP)
    return NULL_RTX;

  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  rtx_code prev_code;
  struct expand_operand ops[5];
  int dfv;  /* Default flag value encoding for the ccmp insn.  */

  /* Exit early for non integer modes to avoid O(n^2) part of expand_operands. */
  cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0));

  if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
        || op_mode == QImode))
    return NULL_RTX;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  icode = code_for_ccmp (op_mode);

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }

  *prep_seq = end_sequence ();

  target = gen_rtx_REG (cc_mode, FLAGS_REG);
  dfv = ix86_get_flags_cc ((rtx_code) cmp_code);

  prev_code = GET_CODE (prev);
  /* Fixup FP compare code here.  */
  if (GET_MODE (XEXP (prev, 0)) == CCFPmode)
    prev_code = ix86_fp_compare_code_to_integer (prev_code);

  /* ccmp only performs the second compare when the first condition
     holds; for IOR chains invert the previous condition, for AND
     chains flip the default flag value instead.  */
  if (bit_code != AND)
    prev_code = reverse_condition (prev_code);
  else
    dfv = (int)(dfv ^ 1);

  prev = gen_rtx_fmt_ee (prev_code, VOIDmode, XEXP (prev, 0),
                         const0_rtx);

  create_fixed_operand (&ops[0], target);
  create_fixed_operand (&ops[1], prev);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], GEN_INT (dfv));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 5, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
   27203              : 
/* Attempt to convert a CONST_VECTOR into a bcst_mem_operand.
   Returns NULL_RTX if X cannot be expressed as a suitable
   VEC_DUPLICATE in mode MODE.  */
   27207              : 
   27208              : static rtx
   27209           48 : ix86_gen_bcst_mem (machine_mode mode, rtx x)
   27210              : {
   27211           48 :   if (!TARGET_AVX512F
   27212           48 :       || !CONST_VECTOR_P (x)
   27213           64 :       || (!TARGET_AVX512VL && GET_MODE_SIZE (mode) != 64)
   27214          147 :       || !VALID_BCST_MODE_P (GET_MODE_INNER (mode))
   27215              :          /* Disallow HFmode broadcast.  */
   27216          126 :       || GET_MODE_SIZE (GET_MODE_INNER (mode)) < 4)
   27217              :     return NULL_RTX;
   27218              : 
   27219           21 :   rtx cst = CONST_VECTOR_ELT (x, 0);
   27220           21 :   if (!CONST_SCALAR_INT_P (cst)
   27221           15 :       && !CONST_DOUBLE_P (cst)
   27222            0 :       && !CONST_FIXED_P (cst))
   27223              :     return NULL_RTX;
   27224              : 
   27225           21 :   int n_elts = GET_MODE_NUNITS (mode);
   27226           42 :   if (CONST_VECTOR_NUNITS (x) != n_elts)
   27227              :     return NULL_RTX;
   27228              : 
   27229          150 :   for (int i = 1; i < n_elts; i++)
   27230          129 :     if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
   27231              :       return NULL_RTX;
   27232              : 
   27233           42 :   rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
   27234           21 :   return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
   27235              : }
   27236              : 
   27237              : /* Determine the ternlog immediate index that implements 3-operand
   27238              :    ternary logic expression OP.  This uses and modifies the 3 element
   27239              :    array ARGS to record and check the leaves, either 3 REGs, or 2 REGs
   27240              :    and MEM.  Returns an index between 0 and 255 for a valid ternlog,
   27241              :    or -1 if the expression isn't suitable.  */
   27242              : 
int
ix86_ternlog_idx (rtx op, rtx *args)
{
  int idx0, idx1;

  if (!op)
    return -1;

  /* The truth tables for the three leaf operands are the classic
     vpternlog encodings: A = 0xf0, B = 0xcc, C = 0xaa.  Leaves are
     assigned to the first free ARGS slot, or matched against an
     already-recorded one.  */
  switch (GET_CODE (op))
    {
    case SUBREG:
      /* Only register-like SUBREGs qualify as leaves.  */
      if (!register_operand (op, GET_MODE (op)))
        return -1;
      /* FALLTHRU */

    case REG:
      if (!args[0])
        {
          args[0] = op;
          return 0xf0;
        }
      if (rtx_equal_p (op, args[0]))
        return 0xf0;
      if (!args[1])
        {
          args[1] = op;
          return 0xcc;
        }
      if (rtx_equal_p (op, args[1]))
        return 0xcc;
      if (!args[2])
        {
          args[2] = op;
          return 0xaa;
        }
      if (rtx_equal_p (op, args[2]))
        return 0xaa;
      /* More than three distinct register leaves: not a ternlog.  */
      return -1;

    case VEC_DUPLICATE:
      /* Embedded-broadcast memory counts as the (single) memory leaf.  */
      if (!bcst_mem_operand (op, GET_MODE (op)))
        return -1;
      goto do_mem_operand;

    case MEM:
      if (!memory_operand (op, GET_MODE (op)))
        return -1;
      if (MEM_P (op)
          && MEM_VOLATILE_P (op)
          && !volatile_ok)
        return -1;
      /* FALLTHRU */

    case CONST_VECTOR:
      /* Memory-like leaves prefer the C slot (ternlog's memory operand).  */
do_mem_operand:
      if (!args[2])
        {
          args[2] = op;
          return 0xaa;
        }
      /* Maximum of one volatile memory reference per expression.  */
      if (side_effects_p (op))
        return -1;
      if (rtx_equal_p (op, args[2]))
        return 0xaa;
      /* Check if CONST_VECTOR is the ones-complement of args[2].  */
      if (CONST_VECTOR_P (op)
          && CONST_VECTOR_P (args[2])
          && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
                                                          op, GET_MODE (op)),
                          args[2]))
        return 0x55;
      if (!args[0])
        {
          args[0] = op;
          return 0xf0;
        }
      if (rtx_equal_p (op, args[0]))
        return 0xf0;
      /* Check if CONST_VECTOR is the ones-complement of args[0].  */
      if (CONST_VECTOR_P (op)
          && CONST_VECTOR_P (args[0])
          && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
                                                          op, GET_MODE (op)),
                          args[0]))
        return 0x0f;
      if (!args[1])
        {
          args[1] = op;
          return 0xcc;
        }
      if (rtx_equal_p (op, args[1]))
        return 0xcc;
      /* Check if CONST_VECTOR is the ones-complement of args[1].  */
      if (CONST_VECTOR_P (op)
          && CONST_VECTOR_P (args[1])
          && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
                                                          op, GET_MODE (op)),
                          args[1]))
        return 0x33;
      return -1;

    /* Interior logic nodes combine their children's truth tables with
       the corresponding bitwise operation on the 8-bit index.  */
    case NOT:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      return (idx0 >= 0) ? idx0 ^ 0xff : -1;

    case AND:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
        return -1;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 & idx1 : -1;

    case IOR:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
        return -1;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 | idx1 : -1;

    case XOR:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
        return -1;
      /* XOR with all-ones is a complement; no extra leaf is consumed.  */
      if (vector_all_ones_operand (XEXP (op, 1), GET_MODE (op)))
        return idx0 ^ 0xff;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 ^ idx1 : -1;

    case UNSPEC:
      /* An existing UNSPEC_VTERNLOG already carries its index, provided
         its operands appear in canonical A/B/C order.  */
      if (XINT (op, 1) != UNSPEC_VTERNLOG
          || XVECLEN (op, 0) != 4
          || !CONST_INT_P (XVECEXP (op, 0, 3)))
        return -1;

      /* TODO: Handle permuted operands.  */
      if (ix86_ternlog_idx (XVECEXP (op, 0, 0), args) != 0xf0
          || ix86_ternlog_idx (XVECEXP (op, 0, 1), args) != 0xcc
          || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
        return -1;
      return INTVAL (XVECEXP (op, 0, 3));

    default:
      return -1;
    }
}
   27389              : 
   27390              : /* Return TRUE if OP (in mode MODE) is the leaf of a ternary logic
   27391              :    expression, such as a register or a memory reference.  */
   27392              : 
   27393              : bool
   27394      3375006 : ix86_ternlog_leaf_p (rtx op, machine_mode mode)
   27395              : {
   27396              :   /* We can't use memory_operand here, as it may return a different
   27397              :      value before and after reload (for volatile MEMs) which creates
   27398              :      problems splitting instructions.  */
   27399      3375006 :   return register_operand (op, mode)
   27400       734365 :          || MEM_P (op)
   27401       384826 :          || CONST_VECTOR_P (op)
   27402      3657777 :          || bcst_mem_operand (op, mode);
   27403              : }
   27404              : 
   27405              : /* Test whether OP is a 3-operand ternary logic expression suitable
   27406              :    for use in a ternlog instruction.  */
   27407              : 
   27408              : bool
   27409      2244378 : ix86_ternlog_operand_p (rtx op)
   27410              : {
   27411      2244378 :   rtx op0, op1;
   27412      2244378 :   rtx args[3];
   27413              : 
   27414      2244378 :   args[0] = NULL_RTX;
   27415      2244378 :   args[1] = NULL_RTX;
   27416      2244378 :   args[2] = NULL_RTX;
   27417      2244378 :   int idx = ix86_ternlog_idx (op, args);
   27418      2244378 :   if (idx < 0)
   27419              :     return false;
   27420              : 
   27421              :   /* Don't match simple (binary or unary) expressions.  */
   27422      1824141 :   machine_mode mode = GET_MODE (op);
   27423      1824141 :   switch (GET_CODE (op))
   27424              :     {
   27425       837736 :     case AND:
   27426       837736 :       op0 = XEXP (op, 0);
   27427       837736 :       op1 = XEXP (op, 1);
   27428              : 
   27429              :       /* Prefer pand.  */
   27430       837736 :       if (ix86_ternlog_leaf_p (op0, mode)
   27431       837736 :           && ix86_ternlog_leaf_p (op1, mode))
   27432              :         return false;
   27433              :       /* Prefer pandn.  */
   27434       109360 :       if (GET_CODE (op0) == NOT
   27435        77778 :           && register_operand (XEXP (op0, 0), mode)
   27436       183516 :           && ix86_ternlog_leaf_p (op1, mode))
   27437              :         return false;
   27438              :       break;
   27439              : 
   27440       624658 :     case IOR:
   27441              :       /* Prefer por.  */
   27442       624658 :       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
   27443       624658 :           && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
   27444              :         return false;
   27445              :       break;
   27446              : 
   27447       328561 :     case XOR:
   27448       328561 :       op1 = XEXP (op, 1);
   27449              :       /* Prefer pxor, or one_cmpl<vmode>2.  */
   27450       328561 :       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
   27451       328561 :           && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
   27452              :         return false;
   27453              :       break;
   27454              : 
   27455              :     default:
   27456              :       break;
   27457              :     }
   27458              :   return true;
   27459              : }
   27460              : 
   27461              : /* Helper function for ix86_expand_ternlog.  */
   27462              : static rtx
   27463            0 : ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode,
   27464              :                            rtx op0, rtx op1, rtx target)
   27465              : {
   27466            0 :   if (GET_MODE (op0) != mode)
   27467            0 :     op0 = gen_lowpart (mode, op0);
   27468            0 :   if (GET_MODE (op1) != mode)
   27469            0 :     op1 = gen_lowpart (mode, op1);
   27470              : 
   27471            0 :   if (CONST_VECTOR_P (op0))
   27472            0 :     op0 = validize_mem (force_const_mem (mode, op0));
   27473            0 :   if (CONST_VECTOR_P (op1))
   27474            0 :     op1 = validize_mem (force_const_mem (mode, op1));
   27475              : 
   27476            0 :   if (!register_operand (op0, mode))
   27477              :     {
   27478            0 :       if (!register_operand (op1, mode))
   27479              :         {
   27480              :           /* We can't use force_reg (op0, mode).  */
   27481            0 :           rtx reg = gen_reg_rtx (mode);
   27482            0 :           emit_move_insn (reg, op0);
   27483            0 :           op0 = reg;
   27484              :         }
   27485              :       else
   27486              :         std::swap (op0, op1);
   27487              :     }
   27488            0 :   rtx ops[3] = { target, op0, op1 };
   27489            0 :   ix86_expand_vector_logical_operator (code, mode, ops);
   27490            0 :   return target;
   27491              : }
   27492              : 
   27493              : 
   27494              : /* Helper function for ix86_expand_ternlog.  */
   27495              : static rtx
   27496            0 : ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target)
   27497              : {
   27498            0 :   if (GET_MODE (op0) != mode)
   27499            0 :     op0 = gen_lowpart (mode, op0);
   27500            0 :   op0 = gen_rtx_NOT (mode, op0);
   27501            0 :   if (GET_MODE (op1) != mode)
   27502            0 :     op1 = gen_lowpart (mode, op1);
   27503            0 :   if (CONST_VECTOR_P (op1))
   27504            0 :     op1 = validize_mem (force_const_mem (mode, op1));
   27505            0 :   emit_move_insn (target, gen_rtx_AND (mode, op0, op1));
   27506            0 :   return target;
   27507              : }
   27508              : 
   27509              : /* Expand a 3-operand ternary logic expression.  Return TARGET. */
   27510              : rtx
   27511         2429 : ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
   27512              :                      rtx target)
   27513              : {
   27514         2429 :   rtx tmp0, tmp1, tmp2;
   27515              : 
   27516         2429 :   if (!target)
   27517            3 :     target = gen_reg_rtx (mode);
   27518              : 
   27519              :   /* Canonicalize ternlog index for degenerate (duplicated) operands.  */
   27520         2429 :   if (rtx_equal_p (op0, op1) && rtx_equal_p (op0, op2))
   27521            0 :     switch (idx & 0x81)
   27522              :       {
   27523              :       case 0x00:
   27524              :         idx = 0x00;
   27525              :         break;
   27526              :       case 0x01:
   27527              :         idx = 0x0f;
   27528              :         break;
   27529              :       case 0x80:
   27530              :         idx = 0xf0;
   27531              :         break;
   27532              :       case 0x81:
   27533              :         idx = 0xff;
   27534              :         break;
   27535              :       }
   27536              : 
   27537         2429 :   switch (idx & 0xff)
   27538              :     {
   27539            0 :     case 0x00:
   27540            0 :       if ((!op0 || !side_effects_p (op0))
   27541            0 :           && (!op1 || !side_effects_p (op1))
   27542            0 :           && (!op2 || !side_effects_p (op2)))
   27543              :         {
   27544            0 :           emit_move_insn (target, CONST0_RTX (mode));
   27545            0 :           return target;
   27546              :         }
   27547              :       break;
   27548              : 
   27549            0 :     case 0x0a: /* ~a&c */
   27550            0 :       if ((!op1 || !side_effects_p (op1))
   27551            0 :           && op0 && register_operand (op0, mode)
   27552            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27553            0 :         return ix86_expand_ternlog_andnot (mode, op0, op2, target);
   27554              :       break;
   27555              : 
   27556            0 :     case 0x0c: /* ~a&b */
   27557            0 :       if ((!op2 || !side_effects_p (op2))
   27558            0 :           && op0 && register_operand (op0, mode)
   27559            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode))
   27560            0 :         return ix86_expand_ternlog_andnot (mode, op0, op1, target);
   27561              :       break;
   27562              : 
   27563           81 :     case 0x0f:  /* ~a */
   27564            0 :       if ((!op1 || !side_effects_p (op1))
   27565           81 :           && (!op2 || !side_effects_p (op2))
   27566          162 :           && op0)
   27567              :         {
   27568           81 :           emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
   27569           81 :           return target;
   27570              :         }
   27571              :       break;
   27572              : 
   27573            0 :     case 0x22: /* ~b&c */
   27574            0 :       if ((!op0 || !side_effects_p (op0))
   27575            0 :           && op1 && register_operand (op1, mode)
   27576            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27577            0 :         return ix86_expand_ternlog_andnot (mode, op1, op2, target);
   27578              :       break;
   27579              : 
   27580            0 :     case 0x30: /* ~b&a */
   27581            0 :       if ((!op2 || !side_effects_p (op2))
   27582            0 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27583            0 :           && op1 && register_operand (op1, mode))
   27584            0 :         return ix86_expand_ternlog_andnot (mode, op1, op0, target);
   27585              :       break;
   27586              : 
   27587            0 :     case 0x33:  /* ~b */
   27588            0 :       if ((!op0 || !side_effects_p (op0))
   27589            0 :           && (!op2 || !side_effects_p (op2))
   27590            0 :           && op1)
   27591              :         {
   27592            0 :           emit_move_insn (target, gen_rtx_XOR (mode, op1, CONSTM1_RTX (mode)));
   27593            0 :           return target;
   27594              :         }
   27595              :       break;
   27596              : 
   27597            0 :     case 0x3c:  /* a^b */
   27598            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27599            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27600            0 :           && (!op2 || !side_effects_p (op2)))
   27601            0 :         return ix86_expand_ternlog_binop (XOR, mode, op0, op1, target);
   27602              :       break;
   27603              : 
   27604            0 :     case 0x44: /* ~c&b */
   27605            0 :       if ((!op0 || !side_effects_p (op0))
   27606            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27607            0 :           && op2 && register_operand (op2, mode))
   27608            0 :         return ix86_expand_ternlog_andnot (mode, op2, op1, target);
   27609              :       break;
   27610              : 
   27611            2 :     case 0x50: /* ~c&a */
   27612            0 :       if ((!op1 || !side_effects_p (op1))
   27613            2 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27614            4 :           && op2 && register_operand (op2, mode))
   27615            0 :         return ix86_expand_ternlog_andnot (mode, op2, op0, target);
   27616              :       break;
   27617              : 
   27618            4 :     case 0x55:  /* ~c */
   27619            1 :       if ((!op0 || !side_effects_p (op0))
   27620            4 :           && (!op1 || !side_effects_p (op1))
   27621            8 :           && op2)
   27622              :         {
   27623            4 :           emit_move_insn (target, gen_rtx_XOR (mode, op2, CONSTM1_RTX (mode)));
   27624            4 :           return target;
   27625              :         }
   27626              :       break;
   27627              : 
   27628            0 :     case 0x5a:  /* a^c */
   27629            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27630            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode)
   27631            0 :           && (!op1 || !side_effects_p (op1)))
   27632            0 :         return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
   27633              :       break;
   27634              : 
   27635            0 :     case 0x66:  /* b^c */
   27636            0 :       if ((!op0 || !side_effects_p (op0))
   27637            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27638            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27639            0 :         return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
   27640              :       break;
   27641              : 
   27642            0 :     case 0x88:  /* b&c */
   27643            0 :       if ((!op0 || !side_effects_p (op0))
   27644            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27645            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27646            0 :         return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
   27647              :       break;
   27648              : 
   27649            0 :     case 0xa0:  /* a&c */
   27650            0 :       if ((!op1 || !side_effects_p (op1))
   27651            0 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27652            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27653            0 :         return ix86_expand_ternlog_binop (AND, mode, op0, op2, target);
   27654              :       break;
   27655              : 
   27656            0 :     case 0xaa:  /* c */
   27657            0 :       if ((!op0 || !side_effects_p (op0))
   27658            0 :           && (!op1 || !side_effects_p (op1))
   27659            0 :           && op2)
   27660              :         {
   27661            0 :           if (GET_MODE (op2) != mode)
   27662            0 :             op2 = gen_lowpart (mode, op2);
   27663            0 :           emit_move_insn (target, op2);
   27664            0 :           return target;
   27665              :         }
   27666              :       break;
   27667              : 
   27668            0 :     case 0xc0:  /* a&b */
   27669            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27670            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27671            0 :           && (!op2 || !side_effects_p (op2)))
   27672            0 :         return ix86_expand_ternlog_binop (AND, mode, op0, op1, target);
   27673              :       break;
   27674              : 
   27675            0 :     case 0xcc:  /* b */
   27676            0 :       if ((!op0 || !side_effects_p (op0))
   27677            0 :           && op1
   27678            0 :           && (!op2 || !side_effects_p (op2)))
   27679              :         {
   27680            0 :           if (GET_MODE (op1) != mode)
   27681            0 :             op1 = gen_lowpart (mode, op1);
   27682            0 :           emit_move_insn (target, op1);
   27683            0 :           return target;
   27684              :         }
   27685              :       break;
   27686              : 
   27687            0 :     case 0xee:  /* b|c */
   27688            0 :       if ((!op0 || !side_effects_p (op0))
   27689            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27690            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27691            0 :         return ix86_expand_ternlog_binop (IOR, mode, op1, op2, target);
   27692              :       break;
   27693              : 
   27694            6 :     case 0xf0:  /* a */
   27695            6 :       if (op0
   27696            6 :           && (!op1 || !side_effects_p (op1))
   27697           12 :           && (!op2 || !side_effects_p (op2)))
   27698              :         {
   27699            6 :           if (GET_MODE (op0) != mode)
   27700            0 :             op0 = gen_lowpart (mode, op0);
   27701            6 :           emit_move_insn (target, op0);
   27702            6 :           return target;
   27703              :         }
   27704              :       break;
   27705              : 
   27706            0 :     case 0xfa:  /* a|c */
   27707            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27708            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode)
   27709            0 :           && (!op1 || !side_effects_p (op1)))
   27710            0 :         return ix86_expand_ternlog_binop (IOR, mode, op0, op2, target);
   27711              :       break;
   27712              : 
   27713            0 :     case 0xfc:  /* a|b */
   27714            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27715            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27716            0 :           && (!op2 || !side_effects_p (op2)))
   27717            0 :         return ix86_expand_ternlog_binop (IOR, mode, op0, op1, target);
   27718              :       break;
   27719              : 
   27720            0 :     case 0xff:
   27721            0 :       if ((!op0 || !side_effects_p (op0))
   27722            0 :           && (!op1 || !side_effects_p (op1))
   27723            0 :           && (!op2 || !side_effects_p (op2)))
   27724              :         {
   27725            0 :           emit_move_insn (target, CONSTM1_RTX (mode));
   27726            0 :           return target;
   27727              :         }
   27728              :       break;
   27729              :     }
   27730              : 
   27731         2338 :   if (!register_operand (op0, mode))
   27732              :     {
   27733              :       /* We can't use force_reg (mode, op0).  */
   27734           12 :       tmp0 = gen_reg_rtx (GET_MODE (op0));
   27735           12 :       emit_move_insn (tmp0,op0);
   27736              :     }
   27737              :   else
   27738              :     tmp0 = op0;
   27739         2338 :   if (GET_MODE (tmp0) != mode)
   27740            0 :     tmp0 = gen_lowpart (mode, tmp0);
   27741              : 
   27742         2338 :   if (!op1 || rtx_equal_p (op0, op1))
   27743            6 :     tmp1 = copy_rtx (tmp0);
   27744         2332 :   else if (!register_operand (op1, mode))
   27745              :     {
   27746              :       /* We can't use force_reg (mode, op1).  */
   27747           28 :       tmp1 = gen_reg_rtx (GET_MODE (op1));
   27748           28 :       emit_move_insn (tmp1, op1);
   27749              :     }
   27750              :   else
   27751              :     tmp1 = op1;
   27752         2338 :   if (GET_MODE (tmp1) != mode)
   27753            0 :     tmp1 = gen_lowpart (mode, tmp1);
   27754              : 
   27755         2338 :   if (!op2 || rtx_equal_p (op0, op2))
   27756           79 :     tmp2 = copy_rtx (tmp0);
   27757         2259 :   else if (rtx_equal_p (op1, op2))
   27758            0 :     tmp2 = copy_rtx (tmp1);
   27759         2259 :   else if (CONST_VECTOR_P (op2))
   27760              :     {
   27761           43 :       if (GET_MODE (op2) != mode)
   27762            0 :         op2 = gen_lowpart (mode, op2);
   27763           43 :       tmp2 = ix86_gen_bcst_mem (mode, op2);
   27764           43 :       if (!tmp2)
   27765              :         {
   27766           25 :           machine_mode bcst32_mode = mode;
   27767           25 :           machine_mode bcst64_mode = mode;
   27768           25 :           switch (mode)
   27769              :             {
   27770            1 :             case V1TImode:
   27771            1 :             case V4SImode:
   27772            1 :             case V4SFmode:
   27773            1 :             case V8HImode:
   27774            1 :             case V16QImode:
   27775            1 :               bcst32_mode = V4SImode;
   27776            1 :               bcst64_mode = V2DImode;
   27777            1 :               break;
   27778              : 
   27779            0 :             case V2TImode:
   27780            0 :             case V8SImode:
   27781            0 :             case V8SFmode:
   27782            0 :             case V16HImode:
   27783            0 :             case V32QImode:
   27784            0 :               bcst32_mode = V8SImode;
   27785            0 :               bcst64_mode = V4DImode;
   27786            0 :               break;
   27787              : 
   27788            3 :             case V4TImode:
   27789            3 :             case V16SImode:
   27790            3 :             case V16SFmode:
   27791            3 :             case V32HImode:
   27792            3 :             case V64QImode:
   27793            3 :               bcst32_mode = V16SImode;
   27794            3 :               bcst64_mode = V8DImode;
   27795            3 :               break;
   27796              : 
   27797              :             default:
   27798              :               break;
   27799              :             }
   27800              : 
   27801           25 :           if (bcst32_mode != mode)
   27802              :             {
   27803            4 :               tmp2 = gen_lowpart (bcst32_mode, op2);
   27804            4 :               if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
   27805              :                 {
   27806            3 :                   tmp2 = ix86_expand_ternlog (bcst32_mode,
   27807            3 :                                               gen_lowpart (bcst32_mode, tmp0),
   27808            3 :                                               gen_lowpart (bcst32_mode, tmp1),
   27809              :                                               tmp2, idx, NULL_RTX);
   27810            3 :                   emit_move_insn (target, gen_lowpart (mode, tmp2));
   27811            3 :                   return target;
   27812              :                 }
   27813              :             }
   27814              : 
   27815           22 :           if (bcst64_mode != mode)
   27816              :             {
   27817            1 :               tmp2 = gen_lowpart (bcst64_mode, op2);
   27818            1 :               if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
   27819              :                 {
   27820            0 :                   tmp2 = ix86_expand_ternlog (bcst64_mode,
   27821            0 :                                               gen_lowpart (bcst64_mode, tmp0),
   27822            0 :                                               gen_lowpart (bcst64_mode, tmp1),
   27823              :                                               tmp2, idx, NULL_RTX);
   27824            0 :                   emit_move_insn (target, gen_lowpart (mode, tmp2));
   27825            0 :                   return target;
   27826              :                 }
   27827              :             }
   27828              : 
   27829           22 :           tmp2 = force_const_mem (mode, op2);
   27830           22 :           rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
   27831           22 :           tmp2 = validize_mem (tmp2);
   27832           22 :           if (bcast)
   27833              :             {
   27834           12 :               rtx reg2 = gen_reg_rtx (mode);
   27835           12 :               bool ok = ix86_expand_vector_init_duplicate (false, mode,
   27836              :                                                            reg2, bcast);
   27837           12 :               if (ok)
   27838         2335 :                 tmp2 = reg2;
   27839              :             }
   27840              :         }
   27841              :     }
   27842              :   else
   27843              :     tmp2 = op2;
   27844         2335 :   if (GET_MODE (tmp2) != mode)
   27845            0 :     tmp2 = gen_lowpart (mode, tmp2);
   27846              :   /* Some memory_operands are not vector_memory_operands.  */
   27847         2335 :   if (!bcst_vector_operand (tmp2, mode))
   27848            0 :     tmp2 = force_reg (mode, tmp2);
   27849              : 
   27850         2335 :   rtvec vec = gen_rtvec (4, tmp0, tmp1, tmp2, GEN_INT (idx));
   27851         2335 :   emit_move_insn (target, gen_rtx_UNSPEC (mode, vec, UNSPEC_VTERNLOG));
   27852         2335 :   return target;
   27853              : }
   27854              : 
/* GF2P8AFFINEQB matrices to implement shift and rotate.  Each entry packs
   an 8x8 bit-matrix over GF(2) into a uint64_t, one byte per matrix row;
   the GF2P8AFFINEQB instruction applies such a matrix to every byte of a
   vector operand.  The arrays are indexed by the per-byte shift/rotate
   count; index 0 is only a placeholder, since
   ix86_vgf2p8affine_shift_matrix asserts the count is in [1, 7].  */

/* Matrices for left shift (ASHIFT) of each byte by 1..7 bits.  */

static const uint64_t matrix_ashift[8] =
{
  0,
  0x0001020408102040, /* 1 l */
  0x0000010204081020, /* 2 l */
  0x0000000102040810, /* 3 l */
  0x0000000001020408, /* 4 l */
  0x0000000000010204, /* 5 l */
  0x0000000000000102, /* 6 l */
  0x0000000000000001  /* 7 l */
};
   27868              : 
/* Matrices for logical (zero-filling) right shift (LSHIFTRT) of each
   byte by 1..7 bits; index 0 unused.  */

static const uint64_t matrix_lshiftrt[8] =
{
  0,
  0x0204081020408000, /* 1 r */
  0x0408102040800000, /* 2 r */
  0x0810204080000000, /* 3 r */
  0x1020408000000000, /* 4 r */
  0x2040800000000000, /* 5 r */
  0x4080000000000000, /* 6 r */
  0x8000000000000000  /* 7 r */
};
   27880              : 
/* Matrices for arithmetic right shift (ASHIFTRT) of each byte by 1..7
   bits; index 0 unused.  These differ from matrix_lshiftrt only in the
   trailing 0x80 row bytes, which replicate the sign bit into the
   vacated positions.  */

static const uint64_t matrix_ashiftrt[8] =
{
  0,
  0x0204081020408080, /* 1 r */
  0x0408102040808080, /* 2 r */
  0x0810204080808080, /* 3 r */
  0x1020408080808080, /* 4 r */
  0x2040808080808080, /* 5 r */
  0x4080808080808080, /* 6 r */
  0x8080808080808080  /* 7 r */
};
   27892              : 
/* Matrices for left rotate (ROTATE) of each byte by 1..7 bits; index 0
   unused.  */

static const uint64_t matrix_rotate[8] =
{
  0,
  0x8001020408102040, /* 1 rol8 */
  0x4080010204081020, /* 2 rol8 */
  0x2040800102040810, /* 3 rol8 */
  0x1020408001020408, /* 4 rol8 */
  0x0810204080010204, /* 5 rol8 */
  0x0408102040800102, /* 6 rol8 */
  0x0204081020408001  /* 7 rol8 */
};
   27904              : 
/* Matrices for right rotate (ROTATERT) of each byte by 1..7 bits; index
   0 unused.  Rotating right by N is the same matrix as rotating left by
   8 - N, as the table entries show.  */

static const uint64_t matrix_rotatert[8] =
{
  0,
  0x0204081020408001, /* 1 ror8 */
  0x0408102040800102, /* 2 ror8 */
  0x0810204080010204, /* 3 ror8 */
  0x1020408001020408, /* 4 ror8 */
  0x2040800102040810, /* 5 ror8 */
  0x4080010204081020, /* 6 ror8 */
  0x8001020408102040  /* 7 ror8 */
};
   27916              : 
   27917              : /* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift
   27918              :    for CODE and shift count COUNT into register with vector of size of SRC.  */
   27919              : 
   27920              : rtx
   27921          189 : ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
   27922              : {
   27923          189 :   machine_mode mode = GET_MODE (src);
   27924          189 :   const uint64_t *matrix;
   27925          189 :   unsigned shift = INTVAL (count) & 7;
   27926          189 :   gcc_assert (shift > 0 && shift < 8);
   27927              : 
   27928          189 :   switch (code)
   27929              :     {
   27930              :     case ASHIFT:
   27931              :       matrix = matrix_ashift;
   27932              :       break;
   27933           26 :     case ASHIFTRT:
   27934           26 :       matrix = matrix_ashiftrt;
   27935           26 :       break;
   27936           28 :     case LSHIFTRT:
   27937           28 :       matrix = matrix_lshiftrt;
   27938           28 :       break;
   27939           32 :     case ROTATE:
   27940           32 :       matrix = matrix_rotate;
   27941           32 :       break;
   27942           33 :     case ROTATERT:
   27943           33 :       matrix = matrix_rotatert;
   27944           33 :       break;
   27945            0 :     default:
   27946            0 :       gcc_unreachable ();
   27947              :     }
   27948              : 
   27949          189 :   int nelts = GET_MODE_NUNITS (mode);
   27950          189 :   rtvec vec = rtvec_alloc (nelts);
   27951          189 :   uint64_t ma = matrix[shift];
   27952         7741 :   for (int i = 0; i < nelts; i++)
   27953         7552 :     RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
   27954              : 
   27955          189 :   return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
   27956              : }
   27957              : 
   27958              : /* Trunc a vector to a narrow vector, like v4di -> v4si.  */
   27959              : 
   27960              : void
   27961           63 : ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode)
   27962              : {
   27963           63 :   machine_mode out_mode = GET_MODE (output);
   27964           63 :   machine_mode in_mode = GET_MODE (input);
   27965           63 :   int len = GET_MODE_SIZE (in_mode);
   27966          252 :   gcc_assert (len == GET_MODE_SIZE (cvt_mode)
   27967              :               && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode)
   27968              :               && (REG_P (input) || SUBREG_P (input)));
   27969           63 :   scalar_mode inner_out_mode = GET_MODE_INNER (out_mode);
   27970          126 :   int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
   27971           63 :   int out_innersize = GET_MODE_SIZE (inner_out_mode);
   27972              : 
   27973           63 :   struct expand_vec_perm_d d;
   27974           63 :   d.target = gen_reg_rtx (cvt_mode);
   27975           63 :   d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode);
   27976           63 :   d.op1 = d.op0;
   27977           63 :   d.vmode = cvt_mode;
   27978           63 :   d.nelt = GET_MODE_NUNITS (cvt_mode);
   27979           63 :   d.testing_p = false;
   27980           63 :   d.one_operand_p = true;
   27981              : 
   27982              :   /* Init perm. Put the needed bits of input in order and
   27983              :      fill the rest of bits by default.  */
   27984          687 :   for (int i = 0; i < d.nelt; ++i)
   27985              :     {
   27986          624 :       d.perm[i] = i;
   27987         1248 :       if (i < GET_MODE_NUNITS (out_mode))
   27988          246 :         d.perm[i] = i * (in_innersize / out_innersize);
   27989              :     }
   27990              : 
   27991           63 :   bool ok = ix86_expand_vec_perm_const_1(&d);
   27992           63 :   gcc_assert (ok);
   27993           63 :   emit_move_insn (output, gen_lowpart (out_mode, d.target));
   27994           63 : }
   27995              : 
   27996              : /* Implement truncv8sfv8bf2 with vector permutation.  */
   27997              : void
   27998            8 : ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
   27999              : {
   28000            8 :   machine_mode vperm_mode, src_mode = GET_MODE (src);
   28001            8 :   switch (src_mode)
   28002              :     {
   28003              :     case V16SFmode:
   28004              :       vperm_mode = V32BFmode;
   28005              :       break;
   28006            2 :     case V8SFmode:
   28007            2 :       vperm_mode = V16BFmode;
   28008            2 :       break;
   28009            4 :     case V4SFmode:
   28010            4 :       vperm_mode = V8BFmode;
   28011            4 :       break;
   28012            0 :     default:
   28013            0 :       gcc_unreachable ();
   28014              :     }
   28015              : 
   28016            8 :   int nelt = GET_MODE_NUNITS (vperm_mode);
   28017            8 :   vec_perm_builder sel (nelt, nelt, 1);
   28018            8 :   sel.quick_grow (nelt);
   28019          136 :   for (int i = 0; i != nelt; i++)
   28020          128 :     sel[i] = (2 * i + 1) % nelt;
   28021           16 :   vec_perm_indices indices (sel, 1, nelt);
   28022              : 
   28023            8 :   rtx target = gen_reg_rtx (vperm_mode);
   28024            8 :   rtx op0 = lowpart_subreg (vperm_mode,
   28025              :                             force_reg (src_mode, src),
   28026              :                             src_mode);
   28027            8 :   bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
   28028              :                                               target, op0, op0, indices);
   28029            8 :   gcc_assert (ok);
   28030            8 :   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
   28031            8 : }
   28032              : 
   28033              : /* Implement extendv8bf2v8sf2 with vector permutation.  */
   28034              : void
   28035            8 : ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
   28036              : {
   28037            8 :   machine_mode vperm_mode, src_mode = GET_MODE (src);
   28038            8 :   switch (src_mode)
   28039              :     {
   28040              :     case V16BFmode:
   28041              :       vperm_mode = V32BFmode;
   28042              :       break;
   28043            2 :     case V8BFmode:
   28044            2 :       vperm_mode = V16BFmode;
   28045            2 :       break;
   28046            4 :     case V4BFmode:
   28047            4 :       vperm_mode = V8BFmode;
   28048            4 :       break;
   28049            0 :     default:
   28050            0 :       gcc_unreachable ();
   28051              :     }
   28052              : 
   28053            8 :   int nelt = GET_MODE_NUNITS (vperm_mode);
   28054            8 :   vec_perm_builder sel (nelt, nelt, 1);
   28055            8 :   sel.quick_grow (nelt);
   28056          136 :   for (int i = 0, k = 0, j = nelt; i != nelt; i++)
   28057          128 :     sel[i] = i & 1 ? j++ : k++;
   28058              : 
   28059           16 :   vec_perm_indices indices (sel, 2, nelt);
   28060              : 
   28061            8 :   rtx target = gen_reg_rtx (vperm_mode);
   28062            8 :   rtx op1 = lowpart_subreg (vperm_mode,
   28063              :                             force_reg (src_mode, src),
   28064              :                             src_mode);
   28065            8 :   rtx op0 = CONST0_RTX (vperm_mode);
   28066            8 :   bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
   28067              :                                               target, op0, op1, indices);
   28068            8 :   gcc_assert (ok);
   28069            8 :   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
   28070            8 : }
   28071              : 
   28072              : 
   28073              : #include "gt-i386-expand.h"
        

Generated by: LCOV version 2.4-beta

The LCOV profile was generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite was run with the built compiler.