LCOV - code coverage report
Current view: top level - gcc/config/i386 - i386-expand.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 87.1 % 15147 13186
Test Date: 2026-05-30 15:37:04 Functions: 93.7 % 271 254
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
       2              : 
       3              : This file is part of GCC.
       4              : 
       5              : GCC is free software; you can redistribute it and/or modify
       6              : it under the terms of the GNU General Public License as published by
       7              : the Free Software Foundation; either version 3, or (at your option)
       8              : any later version.
       9              : 
      10              : GCC is distributed in the hope that it will be useful,
      11              : but WITHOUT ANY WARRANTY; without even the implied warranty of
      12              : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13              : GNU General Public License for more details.
      14              : 
      15              : You should have received a copy of the GNU General Public License
      16              : along with GCC; see the file COPYING3.  If not see
      17              : <http://www.gnu.org/licenses/>.  */
      18              : 
      19              : #define IN_TARGET_CODE 1
      20              : 
      21              : #include "config.h"
      22              : #include "system.h"
      23              : #include "coretypes.h"
      24              : #include "backend.h"
      25              : #include "rtl.h"
      26              : #include "tree.h"
      27              : #include "memmodel.h"
      28              : #include "gimple.h"
      29              : #include "cfghooks.h"
      30              : #include "cfgloop.h"
      31              : #include "df.h"
      32              : #include "tm_p.h"
      33              : #include "stringpool.h"
      34              : #include "expmed.h"
      35              : #include "optabs.h"
      36              : #include "regs.h"
      37              : #include "emit-rtl.h"
      38              : #include "recog.h"
      39              : #include "cgraph.h"
      40              : #include "diagnostic.h"
      41              : #include "cfgbuild.h"
      42              : #include "alias.h"
      43              : #include "fold-const.h"
      44              : #include "attribs.h"
      45              : #include "calls.h"
      46              : #include "stor-layout.h"
      47              : #include "varasm.h"
      48              : #include "output.h"
      49              : #include "insn-attr.h"
      50              : #include "flags.h"
      51              : #include "except.h"
      52              : #include "explow.h"
      53              : #include "expr.h"
      54              : #include "cfgrtl.h"
      55              : #include "common/common-target.h"
      56              : #include "langhooks.h"
      57              : #include "reload.h"
      58              : #include "gimplify.h"
      59              : #include "dwarf2.h"
      60              : #include "tm-constrs.h"
      61              : #include "cselib.h"
      62              : #include "sched-int.h"
      63              : #include "opts.h"
      64              : #include "tree-pass.h"
      65              : #include "context.h"
      66              : #include "pass_manager.h"
      67              : #include "target-globals.h"
      68              : #include "gimple-iterator.h"
      69              : #include "shrink-wrap.h"
      70              : #include "builtins.h"
      71              : #include "rtl-iter.h"
      72              : #include "tree-iterator.h"
      73              : #include "dbgcnt.h"
      74              : #include "case-cfn-macros.h"
      75              : #include "dojump.h"
      76              : #include "fold-const-call.h"
      77              : #include "tree-vrp.h"
      78              : #include "tree-ssanames.h"
      79              : #include "selftest.h"
      80              : #include "selftest-rtl.h"
      81              : #include "print-rtl.h"
      82              : #include "intl.h"
      83              : #include "ifcvt.h"
      84              : #include "symbol-summary.h"
      85              : #include "sreal.h"
      86              : #include "ipa-cp.h"
      87              : #include "ipa-prop.h"
      88              : #include "ipa-fnsummary.h"
      89              : #include "wide-int-bitmask.h"
      90              : #include "tree-vector-builder.h"
      91              : #include "debug.h"
      92              : #include "dwarf2out.h"
      93              : #include "i386-options.h"
      94              : #include "i386-builtins.h"
      95              : #include "i386-expand.h"
      96              : #include "asan.h"
      97              : 
      98              : /* Split one or more double-mode RTL references into pairs of half-mode
      99              :    references.  MODE is the double mode: TImode, DImode, P2HImode or
     100              :    P2QImode.  The RTL can be REG, offsettable MEM, integer constant, or
     101              :    CONST_DOUBLE.  OPERANDS is an array of NUM double-mode RTLs to split;
     102              :    LO_HALF and HI_HALF are output arrays that parallel OPERANDS.  */
     103              : 
     104              : void
     105      4166852 : split_double_mode (machine_mode mode, rtx operands[],
     106              :                    int num, rtx lo_half[], rtx hi_half[])
     107              : {
     108      4166852 :   machine_mode half_mode;
     109      4166852 :   unsigned int byte;
     110      4166852 :   rtx mem_op = NULL_RTX;
     111      4166852 :   int mem_num = 0;
     112              :   /* Select the half-width mode that corresponds to MODE.  */
     113      4166852 :   switch (mode)
     114              :     {
     115              :     case E_TImode:
     116              :       half_mode = DImode;
     117              :       break;
     118       605892 :     case E_DImode:
     119       605892 :       half_mode = SImode;
     120       605892 :       break;
     121            6 :     case E_P2HImode:
     122            6 :       half_mode = HImode;
     123            6 :       break;
     124           30 :     case E_P2QImode:
     125           30 :       half_mode = QImode;
     126           30 :       break;
     127            0 :     default:
     128            0 :       gcc_unreachable ();
     129              :     }
     130              :   /* BYTE is the byte offset of the high half within the double-mode value.  */
     131      4166852 :   byte = GET_MODE_SIZE (half_mode);
     132              : 
     133      8544254 :   while (num--)
     134              :     {
     135      4377402 :       rtx op = operands[num];
     136              : 
     137              :       /* simplify_subreg refuses to split volatile memory addresses,
     138              :          but we still have to handle them.  Reuse the halves of an equal MEM.  */
     139      4377402 :       if (MEM_P (op))
     140              :         {
     141      1742901 :           if (mem_op && rtx_equal_p (op, mem_op))
     142              :             {
     143         2426 :               lo_half[num] = lo_half[mem_num];
     144         2426 :               hi_half[num] = hi_half[mem_num];
     145              :             }
     146              :           else
     147              :             {
     148      1740475 :               mem_op = op;
     149      1740475 :               mem_num = num;
     150      1740475 :               lo_half[num] = adjust_address (op, half_mode, 0);
     151      1740475 :               hi_half[num] = adjust_address (op, half_mode, byte);
     152              :             }
     153              :         }
     154              :       else
     155              :         {
     156      2634501 :           lo_half[num] = simplify_gen_subreg (half_mode, op,
     157      2634501 :                                               GET_MODE (op) == VOIDmode
     158              :                                               ? mode : GET_MODE (op), 0);
     159              : 
     160      2634501 :           rtx tmp = simplify_gen_subreg (half_mode, op,
     161      2634501 :                                          GET_MODE (op) == VOIDmode
     162      2634501 :                                          ? mode : GET_MODE (op), byte);
     163              :           /* simplify_gen_subreg will return NULL RTX for the
     164              :              high half of the paradoxical subreg; use a fresh pseudo then.  */
     165      2634501 :           hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
     166              :         }
     167              :     }
     168      4166852 : }
     169              : 
     170              : /* Emit the double word assignment DST = { LO, HI }.  MODE is the double-word
     171              :    mode of DST; LO and HI supply its low and high halves respectively.  */
     172              : void
     173       101077 : split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
     174              : {
     175       101077 :   rtx dlo, dhi;
     176       101077 :   int deleted_move_count = 0;
     177       101077 :   split_double_mode (mode, &dst, 1, &dlo, &dhi);
     178              :   /* Constraints ensure that if both lo and hi are MEMs, then
     179              :      dst has early-clobber and thus addresses of MEMs don't use
     180              :      dlo/dhi registers.  Otherwise if at least one of lo and hi is a MEM,
     181              :      dlo/dhi are registers.  */
     182       101077 :   if (MEM_P (lo)
     183         5558 :       && rtx_equal_p (dlo, hi)
     184       102054 :       && reg_overlap_mentioned_p (dhi, lo))
     185              :     {
     186              :       /* If dlo is same as hi and lo's address uses dhi register,
     187              :          code below would first emit_move_insn (dhi, hi)
     188              :          and then emit_move_insn (dlo, lo).  But the former
     189              :          would invalidate lo's address.  Load into dhi first,
     190              :          then swap.  */
     191          193 :       emit_move_insn (dhi, lo);
     192          193 :       lo = dhi;
     193              :     }
     194       100884 :   else if (MEM_P (hi)
     195         9414 :            && !MEM_P (lo)
     196         6597 :            && !rtx_equal_p (dlo, lo)
     197       102176 :            && reg_overlap_mentioned_p (dlo, hi))
     198              :     {
     199              :       /* In this case, code below would first emit_move_insn (dlo, lo)
     200              :          and then emit_move_insn (dhi, hi).  But the former would
     201              :          invalidate hi's address.  */
     202           11 :       if (rtx_equal_p (dhi, lo))
     203              :         {
     204              :           /* We can't load into dhi first, so load into dlo
     205              :              first and we'll swap.  */
     206            5 :           emit_move_insn (dlo, hi);
     207            5 :           hi = dlo;
     208              :         }
     209              :       else
     210              :         {
     211              :           /* Load into dhi first.  */
     212            6 :           emit_move_insn (dhi, hi);
     213            6 :           hi = dhi;
     214              :         }
     215              :     }
     216       101077 :   if (!rtx_equal_p (dlo, hi))
     217              :     {
     218        87104 :       if (!rtx_equal_p (dlo, lo))
     219        37933 :         emit_move_insn (dlo, lo);
     220              :       else
     221              :         deleted_move_count++;
     222        87104 :       if (!rtx_equal_p (dhi, hi))
     223        81027 :         emit_move_insn (dhi, hi);
     224              :       else
     225         6077 :         deleted_move_count++;
     226              :     }
     227        13973 :   else if (!rtx_equal_p (lo, dhi))
     228              :     {
     229         6999 :       if (!rtx_equal_p (dhi, hi))
     230         6999 :         emit_move_insn (dhi, hi);
     231              :       else
     232              :         deleted_move_count++;
     233         6999 :       if (!rtx_equal_p (dlo, lo))
     234         6899 :         emit_move_insn (dlo, lo);
     235              :       else
     236          100 :         deleted_move_count++;
     237              :     }
     238         6974 :   else if (mode == TImode)
     239         6956 :     emit_insn (gen_swapdi (dlo, dhi));
     240              :   else
     241           18 :     emit_insn (gen_swapsi (dlo, dhi));
     242              :   /* Both half moves were skipped as no-ops; emit a deleted note instead.  */
     243       101077 :   if (deleted_move_count == 2)
     244         3085 :     emit_note (NOTE_INSN_DELETED);
     245       101077 : }
     246              : 
     247              : 
     248              : /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
     249              :    for the target.  Only valid after reload; a DEST narrower than 4 bytes
     250              :    is cleared as the SImode register with the same register number.  */
     251              : void
     252       112714 : ix86_expand_clear (rtx dest)
     253              : {
     254       112714 :   rtx tmp;
     255              : 
     256              :   /* We play register width games, which are only valid after reload.  */
     257       112714 :   gcc_assert (reload_completed);
     258              : 
     259              :   /* Avoid HImode and its attendant prefix byte.  */
     260       225428 :   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
     261          930 :     dest = gen_rtx_REG (SImode, REGNO (dest));
     262       112714 :   tmp = gen_rtx_SET (dest, const0_rtx);
     263              :   /* Unless mov $0 is preferred and we are not optimizing for size, add a FLAGS_REG clobber (presumably the xor form).  */
     264       112714 :   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
     265              :     {
     266       112714 :       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
     267       112714 :       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
     268              :     }
     269              : 
     270       112714 :   emit_insn (tmp);
     271       112714 : }
     272              : 
     273              : /* Return true if all WIDTH-bit chunks of V are identical; VAL_BROADCAST is
     274              :    then that chunk sign-extended from WIDTH bits.  Otherwise, return false.  */
     275              : 
     276              : static bool
     277         4851 : ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
     278              :                 HOST_WIDE_INT &val_broadcast)
     279              : {
     280         4851 :   wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
     281         4851 :   val_broadcast = wi::extract_uhwi (val, 0, width);
     282         6543 :   for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
     283              :     {
     284         5089 :       HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
     285         5089 :       if (val_broadcast != each)
     286              :         return false;
     287              :     }
     288         1454 :   val_broadcast = sext_hwi (val_broadcast, width);
     289         1454 :   return true;
     290         4851 : }
     291              : 
     292              : /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  Return the
     293              :    result as a MODE lowpart of a new pseudo, or nullptr if not possible.  */
     294              : rtx
     295        32417 : ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
     296              : {
     297              :   /* Don't use integer vector broadcast if we can't move from GPR to SSE
     298              :      register directly.  */
     299        32417 :   if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
     300              :     return nullptr;
     301              : 
     302        32417 :   unsigned int msize = GET_MODE_SIZE (mode);
     303              : 
     304              :   /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm.  */
     305        32417 :   if (msize != 16 && msize != 32 && msize != 64)
     306              :     return nullptr;
     307              : 
     308              :   /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     309              :      broadcast only if vector broadcast is available.  */
     310        32417 :   if (!TARGET_AVX
     311         1610 :       || !CONST_WIDE_INT_P (op)
     312         1603 :       || standard_sse_constant_p (op, mode)
     313        34020 :       || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
     314         1603 :           != GET_MODE_BITSIZE (mode)))
     315        30822 :     return nullptr;
     316              : 
     317         1595 :   HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
     318         1595 :   HOST_WIDE_INT val_broadcast;
     319         1595 :   scalar_int_mode broadcast_mode;
     320              :   /* vpbroadcastb zmm requires TARGET_AVX512BW.  */
     321          712 :   if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
     322         2089 :       && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
     323              :                          val_broadcast))
     324              :     broadcast_mode = QImode;
     325          654 :   else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
     326         1968 :            && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
     327              :                               val_broadcast))
     328              :     broadcast_mode = HImode;
     329              :   /* vbroadcasts[sd] only supports memory operands w/o AVX2.
     330              :      When msize == 16, pshufs is used for vec_duplicate.
     331              :      When msize == 64, vpbroadcastd is used, and TARGET_AVX512F must be available.  */
     332          412 :   else if ((msize != 32 || TARGET_AVX2)
     333         1768 :            && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
     334              :                            val_broadcast))
     335              :     broadcast_mode = SImode;
     336         1391 :   else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
     337         2641 :            && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
     338              :                               val_broadcast))
     339              :     broadcast_mode = DImode;
     340              :   else
     341          141 :     return nullptr;
     342              : 
     343              :   /* Check if OP can be broadcasted from VAL, i.e. every element equals VAL.  */
     344         1776 :   for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
     345         1561 :     if (val != CONST_WIDE_INT_ELT (op, i))
     346              :       return nullptr;
     347              : 
     348          215 :   unsigned int nunits = (GET_MODE_SIZE (mode)
     349          215 :                          / GET_MODE_SIZE (broadcast_mode));
     350          215 :   machine_mode vector_mode;
     351          215 :   if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
     352            0 :     gcc_unreachable ();
     353          215 :   rtx target = gen_reg_rtx (vector_mode);
     354          215 :   bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
     355              :                                                target,
     356              :                                                GEN_INT (val_broadcast));
     357          215 :   if (!ok)
     358              :     return nullptr;
     359          215 :   target = lowpart_subreg (mode, target, vector_mode);
     360          215 :   return target;
     361              : }
     362              : /* Expand a MODE move from operands[1] into operands[0], legitimizing TLS, GOT and PIC symbol references and forcing operands into registers or the constant pool where needed.  */
     363              : void
     364     73079271 : ix86_expand_move (machine_mode mode, rtx operands[])
     365              : {
     366     73079271 :   rtx op0, op1;
     367     73079271 :   rtx tmp, addend = NULL_RTX;
     368     73079271 :   enum tls_model model;
     369              : 
     370     73079271 :   op0 = operands[0];
     371     73079271 :   op1 = operands[1];
     372              : 
     373              :   /* Avoid complex sets of likely spilled hard registers before reload.  */
     374     73079271 :   if (!ix86_hardreg_mov_ok (op0, op1))
     375              :     {
     376       138303 :       tmp = gen_reg_rtx (mode);
     377       138303 :       operands[0] = tmp;
     378       138303 :       ix86_expand_move (mode, operands);
     379       138303 :       operands[0] = op0;
     380       138303 :       operands[1] = tmp;
     381       138303 :       op1 = tmp;
     382              :     }
     383              : 
     384     73079271 :   switch (GET_CODE (op1))
     385              :     {
     386       348673 :     case CONST:
     387       348673 :       tmp = XEXP (op1, 0);
     388              : 
     389       348673 :       if (GET_CODE (tmp) != PLUS
     390       336991 :           || !SYMBOL_REF_P (XEXP (tmp, 0)))
     391              :         break;
     392              : 
     393       334329 :       op1 = XEXP (tmp, 0);
     394       334329 :       addend = XEXP (tmp, 1);
     395              :       /* FALLTHRU */
     396              : 
     397      4926591 :     case SYMBOL_REF:
     398      4926591 :       model = SYMBOL_REF_TLS_MODEL (op1);
     399              : 
     400      4926591 :       if (model)
     401        10115 :         op1 = legitimize_tls_address (op1, model, true);
     402      4916476 :       else if (ix86_force_load_from_GOT_p (op1))
     403              :         {
     404              :           /* Load the external function address via GOT slot to avoid PLT.  */
     405           24 :           op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
     406              :                                 (TARGET_64BIT
     407              :                                  ? UNSPEC_GOTPCREL
     408              :                                  : UNSPEC_GOT));
     409           24 :           op1 = gen_rtx_CONST (Pmode, op1);
     410           24 :           op1 = gen_const_mem (Pmode, op1);
     411           20 :           set_mem_alias_set (op1, GOT_ALIAS_SET);
     412              :         }
     413              :       else
     414              :         {
     415              : #if TARGET_PECOFF
     416              :           tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
     417              : 
     418              :           if (tmp)
     419              :             {
     420              :               op1 = tmp;
     421              :               if (!addend)
     422              :                 break;
     423              :             }
     424              :           else
     425              : #endif
     426      4916456 :             {
     427      4916456 :               op1 = operands[1];
     428      4916456 :               break;
     429              :             }
     430              :         }
     431              : 
     432        10135 :       if (addend)
     433              :         {
     434         2786 :           op1 = force_operand (op1, NULL_RTX);
     435         2795 :           op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
     436              :                                      op0, 1, OPTAB_DIRECT);
     437              :         }
     438              :       else
     439         7349 :         op1 = force_operand (op1, op0);
     440              : 
     441        10135 :       if (op1 == op0)
     442              :         return;
     443              : 
     444         1148 :       op1 = convert_to_mode (mode, op1, 1);
     445              :       /* FALLTHRU */
     446              :     default:
     447              :       break;
     448              : 
     449      1490881 :     case SUBREG:
     450              :       /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
     451      1490881 :       if (TARGET_64BIT
     452      1262849 :           && mode == TImode
     453              :           && SUBREG_P (op1)
     454        74142 :           && GET_MODE (SUBREG_REG (op1)) == DImode
     455      1536785 :           && SUBREG_BYTE (op1) == 0)
     456        45904 :         op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
     457              :       /* As not all values in XFmode are representable in real_value,
     458              :          we might be called with unfoldable SUBREGs of constants.  */
     459      1490881 :       if (mode == XFmode
     460         3128 :           && CONSTANT_P (SUBREG_REG (op1))
     461            0 :           && can_create_pseudo_p ())
     462              :         {
     463            0 :           machine_mode imode = GET_MODE (SUBREG_REG (op1));
     464            0 :           rtx r = force_const_mem (imode, SUBREG_REG (op1));
     465            0 :           if (r)
     466            0 :             r = validize_mem (r);
     467              :           else
     468            0 :             r = force_reg (imode, SUBREG_REG (op1));
     469            0 :           op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
     470              :         }
     471              :       break;
     472              :     }
     473              : 
     474     73070284 :   if ((flag_pic || MACHOPIC_INDIRECT)
     475     73070284 :       && symbolic_operand (op1, mode))
     476              :     {
     477              : #if TARGET_MACHO
     478              :       if (TARGET_MACHO && !TARGET_64BIT)
     479              :         {
     480              :           /* dynamic-no-pic */
     481              :           if (MACHOPIC_INDIRECT)
     482              :             {
     483              :               tmp = (op0 && REG_P (op0) && mode == Pmode)
     484              :                     ? op0 : gen_reg_rtx (Pmode);
     485              :               op1 = machopic_indirect_data_reference (op1, tmp);
     486              :               if (MACHOPIC_PURE)
     487              :                 op1 = machopic_legitimize_pic_address (op1, mode,
     488              :                                                        tmp == op1 ? 0 : tmp);
     489              :             }
     490              :           if (op0 != op1 && !MEM_P (op0))
     491              :             {
     492              :               rtx insn = gen_rtx_SET (op0, op1);
     493              :               emit_insn (insn);
     494              :               return;
     495              :             }
     496              :         }
     497              : #endif
     498              : 
     499       335349 :       if (MEM_P (op0))
     500        87451 :         op1 = force_reg (mode, op1);
     501       247898 :       else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
     502              :         {
     503       247841 :           rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
     504       247841 :           op1 = legitimize_pic_address (op1, reg);
     505       247841 :           if (op0 == op1)
     506              :             return;
     507       247841 :           op1 = convert_to_mode (mode, op1, 1);
     508              :         }
     509              :     }
     510              :   else
     511              :     {
     512     72734935 :       if (MEM_P (op0)
     513     99276068 :           && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
     514     10733515 :               || !push_operand (op0, mode))
     515     85005446 :           && MEM_P (op1))
     516      2167563 :         op1 = force_reg (mode, op1);
     517              : 
     518     72734935 :       if (push_operand (op0, mode)
     519     72734935 :           && ! general_no_elim_operand (op1, mode))
     520         1004 :         op1 = copy_to_mode_reg (mode, op1);
     521              : 
     522              :       /* Force large constants in 64bit compilation into register
     523              :          to get them CSEed.  */
     524     72734935 :       if (can_create_pseudo_p ()
     525     67032226 :           && (mode == DImode) && TARGET_64BIT
     526     34904279 :           && immediate_operand (op1, mode)
     527      7904329 :           && !x86_64_zext_immediate_operand (op1, VOIDmode)
     528       716810 :           && !register_operand (op0, mode)
     529     72910280 :           && optimize)
     530       123441 :         op1 = copy_to_mode_reg (mode, op1);
     531              : 
     532     72734935 :       if (can_create_pseudo_p ())
     533              :         {
     534     67032226 :           if (CONST_DOUBLE_P (op1))
     535              :             {
     536              :               /* If we are loading a floating point constant to a
     537              :                  register, force the value to memory now, since we'll
     538              :                  get better code out the back end.  */
     539              : 
     540       897739 :               op1 = validize_mem (force_const_mem (mode, op1));
     541       897739 :               if (!register_operand (op0, mode))
     542              :                 {
     543       129390 :                   tmp = gen_reg_rtx (mode);
     544       129390 :                   emit_insn (gen_rtx_SET (tmp, op1));
     545       129390 :                   emit_move_insn (op0, tmp);
     546       129390 :                   return;
     547              :                 }
     548              :             }
     549              :         }
     550              :     }
     551              : 
     552              :   /* Special case inserting 64-bit values into a TImode register.  */
     553     72940894 :   if (TARGET_64BIT
     554              :       /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
     555     63224688 :       && (optimize || ix86_function_naked (current_function_decl))
     556     43278147 :       && (mode == DImode || mode == DFmode)
     557     29513097 :       && SUBREG_P (op0)
     558       484870 :       && GET_MODE (SUBREG_REG (op0)) == TImode
     559       399895 :       && REG_P (SUBREG_REG (op0))
     560     73340789 :       && REG_P (op1))
     561              :     {
     562              :       /* Use *insvti_lowpart_1 to set lowpart.  */
     563       179747 :       if (SUBREG_BYTE (op0) == 0)
     564              :         {
     565        53566 :           wide_int mask = wi::mask (64, true, 128);
     566        53566 :           tmp = immed_wide_int_const (mask, TImode);
     567        53566 :           op0 = SUBREG_REG (op0);
     568        53566 :           tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
     569        53566 :           if (mode == DFmode)
     570          350 :             op1 = gen_lowpart (DImode, op1);
     571        53566 :           op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
     572        53566 :           op1 = gen_rtx_IOR (TImode, tmp, op1);
     573        53566 :         }
     574              :       /* Use *insvti_highpart_1 to set highpart.  */
     575       126181 :       else if (SUBREG_BYTE (op0) == 8)
     576              :         {
     577       126181 :           wide_int mask = wi::mask (64, false, 128);
     578       126181 :           tmp = immed_wide_int_const (mask, TImode);
     579       126181 :           op0 = SUBREG_REG (op0);
     580       126181 :           tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
     581       126181 :           if (mode == DFmode)
     582          201 :             op1 = gen_lowpart (DImode, op1);
     583       126181 :           op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
     584       126181 :           op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
     585       126181 :           op1 = gen_rtx_IOR (TImode, tmp, op1);
     586       126181 :         }
     587              :     }
     588              : 
     589     72940894 :   emit_insn (gen_rtx_SET (op0, op1));
     590              : }
     591              : 
     592              : /* OP is a MEM whose address names a constant-pool entry (callers are
     593              :    expected to have checked this).  If that entry is a CONST_VECTOR
                           whose elements, viewed in MODE, are all equal, return that element
                           (the scalar rtx itself, not a MEM).  Return nullptr otherwise,
                           and also whenever one of the early checks below rejects MODE or
                           OP.  */
     594              : rtx
     595      2457355 : ix86_broadcast_from_constant (machine_mode mode, rtx op)
     596              : {
                          /* A mode with fewer than two elements has nothing to duplicate.  */
     597      2457355 :   int nunits = GET_MODE_NUNITS (mode);
     598      2457355 :   if (nunits < 2)
     599              :     return nullptr;
     600              : 
     601              :   /* Don't use integer vector broadcast if we can't move from GPR to SSE
     602              :      register directly.  */
     603      2330910 :   if (!TARGET_INTER_UNIT_MOVES_TO_VEC
     604         8020 :       && INTEGRAL_MODE_P (mode))
     605              :     return nullptr;
     606              : 
     607              :   /* Leave OP alone when standard_sse_constant_p already accepts it in
     608              :      MODE; only other constants are candidates for a broadcast.  */
     609      2325500 :   if (standard_sse_constant_p (op, mode))
     610              :     return nullptr;
     611              : 
                          /* Vectors with TImode elements are not handled here.  */
     612      4650994 :   if (GET_MODE_INNER (mode) == TImode)
     613              :     return nullptr;
     614              : 
                          /* Look up the constant behind the address of OP; anything other
                             than a CONST_VECTOR is rejected.  */
     615      2325387 :   rtx constant = get_pool_constant (XEXP (op, 0));
     616      2325387 :   if (!CONST_VECTOR_P (constant))
     617              :     return nullptr;
     618              : 
     619              :   /* There could be some rtx like
     620              :      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     621              :      where "*.LC1" refers to a V2DI constant vector.  In that case
                             reinterpret the pool constant in MODE (subreg at byte 0) and give
                             up if that does not yield a CONST_VECTOR.  */
     622      2325387 :   if (GET_MODE (constant) != mode)
     623              :     {
     624          652 :       constant = simplify_subreg (mode, constant, GET_MODE (constant),
     625              :                                   0);
     626          652 :       if (constant == nullptr || !CONST_VECTOR_P (constant))
     627              :         return nullptr;
     628              :     }
     629              : 
                          /* Element 0 is the candidate; every other element must equal it.  */
     630      2325387 :   rtx first = XVECEXP (constant, 0, 0);
     631              : 
     632      7697007 :   for (int i = 1; i < nunits; ++i)
     633              :     {
     634      7083100 :       rtx tmp = XVECEXP (constant, 0, i);
     635              :       /* Vector duplicate value.  */
     636      7083100 :       if (!rtx_equal_p (tmp, first))
     637              :         return nullptr;
     638              :     }
     639              : 
     640              :   return first;
     641              : }
     642              : 
                        /* Expand a vector move in MODE from OPERANDS[1] to OPERANDS[0] by
                           emitting the needed insns.  In order, this function:
                           - rewrites a push destination through emit_move_resolve_push;
                           - when the source is a constant (or a SUBREG of one) and either
                             the destination is a register and the constant is not a
                             standard SSE constant, or the destination is under-aligned SSE
                             memory, replaces the source by a constant-pool reference or a
                             broadcast;
                           - loads a constant-pool vector of at least 16 bytes whose
                             elements are all equal as a broadcast;
                           - hands under-aligned SSE memory operands to
                             ix86_expand_vector_move_misalign;
                           - builds a 16-byte vector from a SUBREG of a TImode value via
                             V2DImode;
                           - goes through a fresh pseudo where the checks near the end
                             require it, and otherwise emits a plain SET.
                           The steps that create new pseudos only run while
                           can_create_pseudo_p () holds.  */
     643              : void
     644      4810921 : ix86_expand_vector_move (machine_mode mode, rtx operands[])
     645              : {
     646      4810921 :   rtx op0 = operands[0], op1 = operands[1];
     647              :   /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     648              :      psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
     649      4810921 :   unsigned int align = (TARGET_IAMCU
     650      4810921 :                         ? GET_MODE_BITSIZE (mode)
     651      4810921 :                         : GET_MODE_ALIGNMENT (mode));
     652              : 
                          /* A push destination is first rewritten by
                             emit_move_resolve_push.  */
     653      4810921 :   if (push_operand (op0, VOIDmode))
     654         2899 :     op0 = emit_move_resolve_push (mode, op0);
     655              : 
     656              :   /* Force constants other than zero into memory.  We do not know how
     657              :      the instructions used to build constants modify the upper 64 bits
     658              :      of the register, once we have that information we may be able
     659              :      to handle some of them more efficiently.  */
     660      4810921 :   if (can_create_pseudo_p ()
     661      4616121 :       && (CONSTANT_P (op1)
     662      4302536 :           || (SUBREG_P (op1)
     663       309306 :               && CONSTANT_P (SUBREG_REG (op1))))
     664      5124520 :       && ((register_operand (op0, mode)
     665       259869 :            && !standard_sse_constant_p (op1, mode))
     666              :           /* ix86_expand_vector_move_misalign() does not like constants.  */
     667              :           || (SSE_REG_MODE_P (mode)
     668       256983 :               && MEM_P (op0)
     669        38436 :               && MEM_ALIGN (op0) < align)))
     670              :     {
                              /* SUBREG of a constant: place the inner constant in the
                                 constant pool (or in a register if force_const_mem returns
                                 null) and re-apply the SUBREG on top of it.  */
     671         2249 :       if (SUBREG_P (op1))
     672              :         {
     673           14 :           machine_mode imode = GET_MODE (SUBREG_REG (op1));
     674           14 :           rtx r = force_const_mem (imode, SUBREG_REG (op1));
     675           14 :           if (r)
     676           14 :             r = validize_mem (r);
     677              :           else
     678            0 :             r = force_reg (imode, SUBREG_REG (op1));
     679           14 :           op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
     680              :         }
     681              :       else
     682              :         {
                                  /* NOTE: this local shadows the MODE parameter inside this
                                     block; the value used here is the mode of op0.  Prefer a
                                     broadcast when ix86_convert_const_wide_int_to_broadcast
                                     produces one, else load from the constant pool.  */
     683         2235 :           machine_mode mode = GET_MODE (op0);
     684         2235 :           rtx tmp = ix86_convert_const_wide_int_to_broadcast
     685         2235 :             (mode, op1);
     686         2235 :           if (tmp == nullptr)
     687         2214 :             op1 = validize_mem (force_const_mem (mode, op1));
     688              :           else
     689              :             op1 = tmp;
     690              :         }
     691              :     }
     692              : 
                          /* A vector of at least 16 bytes read from the constant pool: if all
                             of its elements are equal, emit a broadcast instead of the
                             load.  */
     693      4810921 :   if (can_create_pseudo_p ()
     694      4616121 :       && GET_MODE_SIZE (mode) >= 16
     695      3892767 :       && VECTOR_MODE_P (mode)
     696      8488336 :       && (MEM_P (op1)
     697       877624 :           && SYMBOL_REF_P (XEXP (op1, 0))
     698       495269 :           && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
     699              :     {
     700       478672 :       rtx first = ix86_broadcast_from_constant (mode, op1);
     701       478672 :       if (first != nullptr)
     702              :         {
     703              :           /* Broadcast to XMM/YMM/ZMM register from an integer
     704              :              constant or scalar mem.  */
     705       124259 :           rtx tmp = gen_reg_rtx (mode);
                                  /* Floating-point elements are broadcast from a constant-pool
                                     scalar rather than from the constant itself.  */
     706       124259 :           if (FLOAT_MODE_P (mode))
     707        29276 :             first = force_const_mem (GET_MODE_INNER (mode), first);
     708       124259 :           bool ok = ix86_expand_vector_init_duplicate (false, mode,
     709              :                                                        tmp, first);
                                  /* On a 32-bit target, retry a failed DImode-element broadcast
                                     with the element placed in memory.  */
     710       124259 :           if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
     711              :             {
     712            0 :               first = force_const_mem (GET_MODE_INNER (mode), first);
     713            0 :               ok = ix86_expand_vector_init_duplicate (false, mode,
     714              :                                                       tmp, first);
     715              :             }
                                  /* If the broadcast could not be generated, fall through to
                                     the ordinary handling below.  */
     716       124259 :           if (ok)
     717              :             {
     718       124259 :               emit_move_insn (op0, tmp);
     719       124259 :               return;
     720              :             }
     721              :         }
     722              :     }
     723              : 
     724              :   /* We need to check memory alignment for SSE mode since attribute
     725              :      can make operands unaligned.  */
     726      4686662 :   if (can_create_pseudo_p ()
     727              :       && SSE_REG_MODE_P (mode)
     728      9503907 :       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
     729      4221004 :           || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
     730              :     {
     731       493548 :       rtx tmp[2];
     732              : 
     733              :       /* ix86_expand_vector_move_misalign() does not like both
     734              :          arguments in memory.  */
     735       493548 :       if (!register_operand (op0, mode)
     736       493548 :           && !register_operand (op1, mode))
     737              :         {
     738       154795 :           rtx scratch = gen_reg_rtx (mode);
     739       154795 :           emit_move_insn (scratch, op1);
     740       154795 :           op1 = scratch;
     741              :         }
     742              : 
     743       493548 :       tmp[0] = op0; tmp[1] = op1;
     744       493548 :       ix86_expand_vector_move_misalign (mode, tmp);
     745       493548 :       return;
     746              :     }
     747              : 
     748              :   /* Special case TImode to 128-bit vector conversions via V2DI.  */
     749      1139970 :   if (VECTOR_MODE_P (mode)
     750      4141858 :       && GET_MODE_SIZE (mode) == 16
     751      2925852 :       && SUBREG_P (op1)
     752       242160 :       && GET_MODE (SUBREG_REG (op1)) == TImode
     753         3276 :       && TARGET_64BIT && TARGET_SSE
     754      4195736 :       && ix86_pre_reload_split ())
     755              :     {
                              /* Split the TImode value into its DImode halves, concatenate
                                 them into a V2DI pseudo and move that, viewed in MODE, to
                                 the destination.  */
     756         2515 :       rtx tmp = gen_reg_rtx (V2DImode);
     757         2515 :       rtx lo = gen_reg_rtx (DImode);
     758         2515 :       rtx hi = gen_reg_rtx (DImode);
     759         2515 :       emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
     760         2515 :       emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
     761         2515 :       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
     762         2515 :       emit_move_insn (op0, gen_lowpart (mode, tmp));
     763         2515 :       return;
     764              :     }
     765              : 
     766              :   /* If operand0 is a hard register, make operand1 a pseudo.  */
     767      4190599 :   if (can_create_pseudo_p ()
     768      8186398 :       && !ix86_hardreg_mov_ok (op0, op1))
     769              :     {
     770          134 :       rtx tmp = gen_reg_rtx (GET_MODE (op0));
     771          134 :       emit_move_insn (tmp, op1);
     772          134 :       emit_move_insn (op0, tmp);
     773          134 :       return;
     774              :     }
     775              : 
     776              :   /* If neither operand is a register, go through a fresh pseudo.  */
     777      4190465 :   if (can_create_pseudo_p ()
     778      3995665 :       && !register_operand (op0, mode)
     779      5304609 :       && !register_operand (op1, mode))
     780              :     {
     781       215540 :       rtx tmp = gen_reg_rtx (GET_MODE (op0));
     782       215540 :       emit_move_insn (tmp, op1);
     783       215540 :       emit_move_insn (op0, tmp);
     784       215540 :       return;
     785              :     }
     786              : 
                          /* Nothing special applies: emit the move as a single SET.  */
     787      3974925 :   emit_insn (gen_rtx_SET (op0, op1));
     788              : }
     789              : 
     790              : /* Split 32-byte AVX unaligned load and store if needed.
                           OP0 is the destination and OP1 the source.  A single SET is
                           emitted when OP1 is a MEM and TARGET_AVX256_SPLIT_UNALIGNED_LOAD is
                           off, or when OP0 is a MEM and TARGET_AVX256_SPLIT_UNALIGNED_STORE
                           is off.  Otherwise the access is performed as two 16-byte halves
                           at byte offsets 0 and 16 of the memory operand.  */
     791              : 
     792              : static void
     793        12554 : ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
     794              : {
     795        12554 :   rtx m;
     796        12554 :   rtx (*extract) (rtx, rtx, rtx);
     797        12554 :   machine_mode mode;
     798              : 
     799        12554 :   if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
     800         4558 :       || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
     801              :     {
     802        12528 :       emit_insn (gen_rtx_SET (op0, op1));
     803        12528 :       return;
     804              :     }
     805              : 
                          /* Integer modes other than V32QImode are handled as V32QImode.  A
                             non-MEM destination is replaced by a fresh V32QI pseudo and the
                             original is remembered in orig_op0 so the result can be copied
                             back at the end.  */
     806           26 :   rtx orig_op0 = NULL_RTX;
     807           26 :   mode = GET_MODE (op0);
     808           26 :   switch (GET_MODE_CLASS (mode))
     809              :     {
     810            9 :     case MODE_VECTOR_INT:
     811            9 :     case MODE_INT:
     812            9 :       if (mode != V32QImode)
     813              :         {
     814            7 :           if (!MEM_P (op0))
     815              :             {
     816            3 :               orig_op0 = op0;
     817            3 :               op0 = gen_reg_rtx (V32QImode);
     818              :             }
     819              :           else
     820            4 :             op0 = gen_lowpart (V32QImode, op0);
     821            7 :           op1 = gen_lowpart (V32QImode, op1);
     822            7 :           mode = V32QImode;
     823              :         }
     824              :       break;
     825              :     case MODE_VECTOR_FLOAT:
     826              :       break;
     827            0 :     default:
     828            0 :       gcc_unreachable ();
     829              :     }
     830              : 
                          /* Pick the 128-bit extract generator for the 256-bit mode and
                             replace MODE by the matching 128-bit half mode.  */
     831           26 :   switch (mode)
     832              :     {
     833            0 :     default:
     834            0 :       gcc_unreachable ();
     835              :     case E_V32QImode:
     836              :       extract = gen_avx_vextractf128v32qi;
     837              :       mode = V16QImode;
     838              :       break;
     839            1 :     case E_V16BFmode:
     840            1 :       extract = gen_avx_vextractf128v16bf;
     841            1 :       mode = V8BFmode;
     842            1 :       break;
     843            0 :     case E_V16HFmode:
     844            0 :       extract = gen_avx_vextractf128v16hf;
     845            0 :       mode = V8HFmode;
     846            0 :       break;
     847            8 :     case E_V8SFmode:
     848            8 :       extract = gen_avx_vextractf128v8sf;
     849            8 :       mode = V4SFmode;
     850            8 :       break;
     851            8 :     case E_V4DFmode:
     852            8 :       extract = gen_avx_vextractf128v4df;
     853            8 :       mode = V2DFmode;
     854            8 :       break;
     855              :     }
     856              : 
                          /* Load: move the half at offset 0 into a register, then set op0 to
                             the VEC_CONCAT of that register and the memory half at
                             offset 16.  */
     857           26 :   if (MEM_P (op1))
     858              :     {
     859            9 :       rtx r = gen_reg_rtx (mode);
     860            9 :       m = adjust_address (op1, mode, 0);
     861            9 :       emit_move_insn (r, m);
     862            9 :       m = adjust_address (op1, mode, 16);
     863            9 :       r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
     864            9 :       emit_move_insn (op0, r);
     865              :     }
                          /* Store: extract half 0 and half 1 of op1 into the memory halves
                             at offsets 0 and 16.  */
     866           17 :   else if (MEM_P (op0))
     867              :     {
     868           17 :       m = adjust_address (op0, mode, 0);
     869           17 :       emit_insn (extract (m, op1, const0_rtx));
     870           17 :       m = adjust_address (op0, mode, 16);
     871           17 :       emit_insn (extract (m, copy_rtx (op1), const1_rtx));
     872              :     }
     873              :   else
     874            0 :     gcc_unreachable ();
     875              : 
                          /* Copy the V32QI temporary back to the original destination in its
                             own mode.  */
     876           26 :   if (orig_op0)
     877            3 :     emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
     878              : }
     879              : 
     880              : /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
     881              :    straight to ix86_expand_vector_move.
                       
                           MODE is the vector mode of the move, OPERANDS[0] the destination and
                           OPERANDS[1] the source.  The paths at the end that split the
                           access into two 8-byte halves require one of the operands to be a
                           MEM; anything else reaches gcc_unreachable.  */
     882              : /* Code generation for scalar reg-reg moves of single and double precision data:
     883              :      if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
     884              :        movaps reg, reg
     885              :      else
     886              :        movss reg, reg
     887              :      if (x86_sse_partial_reg_dependency == true)
     888              :        movapd reg, reg
     889              :      else
     890              :        movsd reg, reg
     891              : 
     892              :    Code generation for scalar loads of double precision data:
     893              :      if (x86_sse_split_regs == true)
     894              :        movlpd mem, reg      (gas syntax)
     895              :      else
     896              :        movsd mem, reg
     897              : 
     898              :    Code generation for unaligned packed loads of single precision data
     899              :    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     900              :      if (x86_sse_unaligned_move_optimal)
     901              :        movups mem, reg
     902              : 
     903              :      if (x86_sse_partial_reg_dependency == true)
     904              :        {
     905              :          xorps  reg, reg
     906              :          movlps mem, reg
     907              :          movhps mem+8, reg
     908              :        }
     909              :      else
     910              :        {
     911              :          movlps mem, reg
     912              :          movhps mem+8, reg
     913              :        }
     914              : 
     915              :    Code generation for unaligned packed loads of double precision data
     916              :    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     917              :      if (x86_sse_unaligned_move_optimal)
     918              :        movupd mem, reg
     919              : 
     920              :      if (x86_sse_split_regs == true)
     921              :        {
     922              :          movlpd mem, reg
     923              :          movhpd mem+8, reg
     924              :        }
     925              :      else
     926              :        {
     927              :          movsd  mem, reg
     928              :          movhpd mem+8, reg
     929              :        }
     930              :  */
     931              : 
     932              : void
     933       822016 : ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
     934              : {
     935       822016 :   rtx op0, op1, m;
     936              : 
     937       822016 :   op0 = operands[0];
     938       822016 :   op1 = operands[1];
     939              : 
     940              :   /* Use unaligned load/store for AVX512 or when optimizing for size.  */
     941      1644032 :   if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
     942              :     {
     943        24075 :       emit_insn (gen_rtx_SET (op0, op1));
     944        24075 :       return;
     945              :     }
     946              : 
                          /* With AVX, only 32-byte moves may be split (by the helper); every
                             other size is emitted as one SET.  */
     947       797941 :   if (TARGET_AVX)
     948              :     {
     949        62002 :       if (GET_MODE_SIZE (mode) == 32)
     950        12554 :         ix86_avx256_split_vector_move_misalign (op0, op1);
     951              :       else
     952              :         /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
     953        18447 :         emit_insn (gen_rtx_SET (op0, op1));
     954        31001 :       return;
     955              :     }
     956              : 
                          /* Tunings for which a single unaligned move is preferred.  */
     957       766940 :   if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
     958           95 :       || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
     959              :     {
     960       766845 :       emit_insn (gen_rtx_SET (op0, op1));
     961       766845 :       return;
     962              :     }
     963              : 
     964              :   /* ??? If we have typed data, then it would appear that using
     965              :      movdqu is the only way to get unaligned data loaded with
     966              :      integer type.  */
     967           95 :   if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
     968              :     {
     969           81 :       emit_insn (gen_rtx_SET (op0, op1));
     970           81 :       return;
     971              :     }
     972              : 
                          /* Load from memory, done as two 8-byte halves.  */
     973           14 :   if (MEM_P (op1))
     974              :     {
     975            6 :       if (TARGET_SSE2 && mode == V2DFmode)
     976              :         {
     977            2 :           rtx zero;
     978              : 
     979              :           /* When SSE registers are split into halves, we can avoid
     980              :              writing to the top half twice.  */
     981            2 :           if (TARGET_SSE_SPLIT_REGS)
     982              :             {
     983            2 :               emit_clobber (op0);
     984            2 :               zero = op0;
     985              :             }
     986              :           else
     987              :             {
     988              :               /* ??? Not sure about the best option for the Intel chips.
     989              :                  The following would seem to satisfy; the register is
     990              :                  entirely cleared, breaking the dependency chain.  We
     991              :                  then store to the upper half, with a dependency depth
     992              :                  of one.  A rumor has it that Intel recommends two movsd
     993              :                  followed by an unpacklpd, but this is unconfirmed.  And
     994              :                  given that the dependency depth of the unpacklpd would
     995              :                  still be one, I'm not sure why this would be better.  */
     996            0 :               zero = CONST0_RTX (V2DFmode);
     997              :             }
     998              : 
                                  /* Low DFmode half from offset 0, then high half from
                                     offset 8.  */
     999            2 :           m = adjust_address (op1, DFmode, 0);
    1000            2 :           emit_insn (gen_sse2_loadlpd (op0, zero, m));
    1001            2 :           m = adjust_address (op1, DFmode, 8);
    1002            2 :           emit_insn (gen_sse2_loadhpd (op0, op0, m));
    1003            2 :         }
    1004              :       else
    1005              :         {
    1006            4 :           rtx t;
    1007              : 
                                  /* Modes other than V4SFmode are assembled in a V4SF
                                     temporary and copied to op0 afterwards.  */
    1008            4 :           if (mode != V4SFmode)
    1009            0 :             t = gen_reg_rtx (V4SFmode);
    1010              :           else
    1011              :             t = op0;
    1012              : 
                                  /* Zero T first under TARGET_SSE_PARTIAL_REG_DEPENDENCY;
                                     otherwise only mark it as clobbered.  */
    1013            4 :           if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
    1014            2 :             emit_move_insn (t, CONST0_RTX (V4SFmode));
    1015              :           else
    1016            2 :             emit_clobber (t);
    1017              : 
    1018            4 :           m = adjust_address (op1, V2SFmode, 0);
    1019            4 :           emit_insn (gen_sse_loadlps (t, t, m));
    1020            4 :           m = adjust_address (op1, V2SFmode, 8);
    1021            4 :           emit_insn (gen_sse_loadhps (t, t, m));
    1022            4 :           if (mode != V4SFmode)
    1023            0 :             emit_move_insn (op0, gen_lowpart (mode, t));
    1024              :         }
    1025              :     }
                          /* Store to memory, done as two 8-byte halves.  */
    1026            8 :   else if (MEM_P (op0))
    1027              :     {
    1028            8 :       if (TARGET_SSE2 && mode == V2DFmode)
    1029              :         {
    1030            2 :           m = adjust_address (op0, DFmode, 0);
    1031            2 :           emit_insn (gen_sse2_storelpd (m, op1));
    1032            2 :           m = adjust_address (op0, DFmode, 8);
    1033            2 :           emit_insn (gen_sse2_storehpd (m, op1));
    1034              :         }
    1035              :       else
    1036              :         {
                                  /* Other modes are stored through a V4SFmode view of the
                                     source.  */
    1037            6 :           if (mode != V4SFmode)
    1038            0 :             op1 = gen_lowpart (V4SFmode, op1);
    1039              : 
    1040            6 :           m = adjust_address (op0, V2SFmode, 0);
    1041            6 :           emit_insn (gen_sse_storelps (m, op1));
    1042            6 :           m = adjust_address (op0, V2SFmode, 8);
    1043            6 :           emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
    1044              :         }
    1045              :     }
    1046              :   else
    1047            0 :     gcc_unreachable ();
    1048              : }
    1049              : 
    1050              : /* Move bits 64:95 to bits 32:63.
                           OP is viewed as V4SImode (via lowpart_subreg) and set to a
                           VEC_SELECT of itself with the selector {0, 2, 0, 0}: element 0 is
                           kept, element 1 receives the old element 2, and elements 2 and 3
                           receive copies of the old element 0.  */
    1051              : 
    1052              : void
    1053          854 : ix86_move_vector_high_sse_to_mmx (rtx op)
    1054              : {
    1055          854 :   rtx mask = gen_rtx_PARALLEL (VOIDmode,
    1056              :                                gen_rtvec (4, GEN_INT (0), GEN_INT (2),
    1057              :                                           GEN_INT (0), GEN_INT (0)));
    1058          854 :   rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
                          /* From here on OP names the VEC_SELECT source expression, not the
                             incoming operand.  */
    1059          854 :   op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
    1060          854 :   rtx insn = gen_rtx_SET (dest, op);
    1061          854 :   emit_insn (insn);
    1062          854 : }
    1063              : 
    1064              : /* Split MMX pack with signed/unsigned saturation with SSE/SSE2.
                           OPERANDS[0] is the destination and OPERANDS[1] and OPERANDS[2] are
                           the two sources; CODE is the truncating rtx code.  All three
                           operands are viewed in the 16-byte vector modes that share their
                           element modes, the pack is emitted in those modes, and
                           ix86_move_vector_high_sse_to_mmx is then applied to the
                           destination.  */
    1065              : 
    1066              : void
    1067          764 : ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
    1068              : {
    1069          764 :   rtx op0 = operands[0];
    1070          764 :   rtx op1 = operands[1];
    1071          764 :   rtx op2 = operands[2];
    1072          764 :   rtx src;
    1073              : 
    1074          764 :   machine_mode dmode = GET_MODE (op0);
    1075          764 :   machine_mode smode = GET_MODE (op1);
    1076          764 :   machine_mode inner_dmode = GET_MODE_INNER (dmode);
    1077          764 :   machine_mode inner_smode = GET_MODE_INNER (smode);
    1078              : 
    1079              :   /* Get the corresponding SSE mode for destination.  */
                          /* NUNITS is the number of destination elements in 16 bytes;
                             sse_half_dmode has half as many elements.  */
    1080          764 :   int nunits = 16 / GET_MODE_SIZE (inner_dmode);
    1081         1528 :   machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
    1082         1528 :                                             nunits).require ();
    1083          764 :   machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
    1084         1528 :                                                  nunits / 2).require ();
    1085              : 
    1086              :   /* Get the corresponding SSE mode for source.  */
    1087          764 :   nunits = 16 / GET_MODE_SIZE (inner_smode);
    1088         1528 :   machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
    1089         1528 :                                             nunits).require ();
    1090              : 
    1091              :   /* Generate SSE pack with signed/unsigned saturation.  */
    1092          764 :   rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
    1093          764 :   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
    1094          764 :   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
    1095              : 
    1096              :   /* packusdw/packuswb does unsigned saturation of a signed source
    1097              :      which is different from generic us_truncate RTX.  */
    1098          764 :   if (code == US_TRUNCATE)
    1099          662 :     src = gen_rtx_UNSPEC (sse_dmode,
    1100              :                           gen_rtvec (2, op1, op2),
    1101              :                           UNSPEC_US_TRUNCATE);
    1102              :   else
    1103              :     {
                              /* Any other CODE is applied to each source separately and the
                                 two half-width results are concatenated.  */
    1104          102 :       op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
    1105          102 :       op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
    1106          102 :       src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
    1107              :     }
    1108              : 
    1109          764 :   emit_move_insn (dest, src);
    1110              : 
    1111          764 :   ix86_move_vector_high_sse_to_mmx (op0);
    1112          764 : }
    1113              : 
    1114              : /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  This is also used
    1115              :    for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
    1116              :    OPERANDS[0].  */
    1117              : 
    1118              : void
    1119         5725 : ix86_split_mmx_punpck (rtx operands[], bool high_p)
    1120              : {
    1121         5725 :   rtx op0 = operands[0];
    1122         5725 :   rtx op1 = operands[1];
    1123         5725 :   rtx op2 = operands[2];
    1124         5725 :   machine_mode mode = GET_MODE (op1);
    1125         5725 :   rtx mask;
    1126              :   /* The corresponding SSE mode.  */
    1127         5725 :   machine_mode sse_mode, double_sse_mode;
    1128              : 
    1129         5725 :   switch (mode)
    1130              :     {
    1131         1513 :     case E_V8QImode:
    1132         1513 :     case E_V4QImode:
    1133         1513 :     case E_V2QImode:
    1134         1513 :       sse_mode = V16QImode;
    1135         1513 :       double_sse_mode = V32QImode;
    1136         1513 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1137              :                                gen_rtvec (16,
    1138              :                                           GEN_INT (0), GEN_INT (16),
    1139              :                                           GEN_INT (1), GEN_INT (17),
    1140              :                                           GEN_INT (2), GEN_INT (18),
    1141              :                                           GEN_INT (3), GEN_INT (19),
    1142              :                                           GEN_INT (4), GEN_INT (20),
    1143              :                                           GEN_INT (5), GEN_INT (21),
    1144              :                                           GEN_INT (6), GEN_INT (22),
    1145              :                                           GEN_INT (7), GEN_INT (23)));
    1146         1513 :       break;
    1147              : 
    1148         3080 :     case E_V4HImode:
    1149         3080 :     case E_V2HImode:
    1150         3080 :       sse_mode = V8HImode;
    1151         3080 :       double_sse_mode = V16HImode;
    1152         3080 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1153              :                                gen_rtvec (8,
    1154              :                                           GEN_INT (0), GEN_INT (8),
    1155              :                                           GEN_INT (1), GEN_INT (9),
    1156              :                                           GEN_INT (2), GEN_INT (10),
    1157              :                                           GEN_INT (3), GEN_INT (11)));
    1158         3080 :       break;
    1159              : 
    1160          795 :     case E_V2SImode:
    1161          795 :       sse_mode = V4SImode;
    1162          795 :       double_sse_mode = V8SImode;
    1163          795 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1164              :                                gen_rtvec (4,
    1165              :                                           GEN_INT (0), GEN_INT (4),
    1166              :                                           GEN_INT (1), GEN_INT (5)));
    1167          795 :       break;
    1168              : 
    1169          337 :     case E_V2SFmode:
    1170          337 :       sse_mode = V4SFmode;
    1171          337 :       double_sse_mode = V8SFmode;
    1172          337 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1173              :                                gen_rtvec (4,
    1174              :                                           GEN_INT (0), GEN_INT (4),
    1175              :                                           GEN_INT (1), GEN_INT (5)));
    1176          337 :       break;
    1177              : 
    1178            0 :     default:
    1179            0 :       gcc_unreachable ();
    1180              :     }
    1181              : 
    1182              :   /* Generate SSE punpcklXX.  */
    1183         5725 :   rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
    1184         5725 :   op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
    1185         5725 :   op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
    1186              : 
    1187         5725 :   op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
    1188         5725 :   op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
    1189         5725 :   rtx insn = gen_rtx_SET (dest, op2);
    1190         5725 :   emit_insn (insn);
    1191              : 
    1192              :   /* Move high bits to low bits.  */
    1193         5725 :   if (high_p)
    1194              :     {
    1195         2294 :       if (sse_mode == V4SFmode)
    1196              :         {
    1197          119 :           mask = gen_rtx_PARALLEL (VOIDmode,
    1198              :                                    gen_rtvec (4, GEN_INT (2), GEN_INT (3),
    1199              :                                               GEN_INT (4), GEN_INT (5)));
    1200          119 :           op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
    1201          119 :           op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
    1202              :         }
    1203              :       else
    1204              :         {
    1205         2175 :           int sz = GET_MODE_SIZE (mode);
    1206              : 
    1207         2175 :           if (sz == 4)
    1208          239 :             mask = gen_rtx_PARALLEL (VOIDmode,
    1209              :                                      gen_rtvec (4, GEN_INT (1), GEN_INT (0),
    1210              :                                                 GEN_INT (0), GEN_INT (1)));
    1211         1936 :           else if (sz == 8)
    1212         1936 :             mask = gen_rtx_PARALLEL (VOIDmode,
    1213              :                                      gen_rtvec (4, GEN_INT (2), GEN_INT (3),
    1214              :                                                 GEN_INT (0), GEN_INT (1)));
    1215              :           else
    1216            0 :             gcc_unreachable ();
    1217              : 
    1218         2175 :           dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
    1219         2175 :           op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
    1220              :         }
    1221              : 
    1222         2294 :       insn = gen_rtx_SET (dest, op1);
    1223         2294 :       emit_insn (insn);
    1224              :     }
    1225         5725 : }
    1226              : 
    1227              : /* Helper function of ix86_fixup_binary_operands to canonicalize
    1228              :    operand order.  Returns true if the operands should be swapped.  */
    1229              : 
    1230              : static bool
    1231    174113385 : ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
    1232              :                              rtx operands[])
    1233              : {
    1234    174113385 :   rtx dst = operands[0];
    1235    174113385 :   rtx src1 = operands[1];
    1236    174113385 :   rtx src2 = operands[2];
    1237              : 
    1238              :   /* If the operation is not commutative, we can't do anything.  */
    1239    174113385 :   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
    1240     26884473 :       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    1241              :     return false;
    1242              : 
    1243              :   /* Highest priority is that src1 should match dst.  */
    1244    147241004 :   if (rtx_equal_p (dst, src1))
    1245              :     return false;
    1246    106820981 :   if (rtx_equal_p (dst, src2))
    1247              :     return true;
    1248              : 
    1249              :   /* Next highest priority is that immediate constants come second.  */
    1250    106738470 :   if (immediate_operand (src2, mode))
    1251              :     return false;
    1252     25728596 :   if (immediate_operand (src1, mode))
    1253              :     return true;
    1254              : 
    1255              :   /* Lowest priority is that memory references should come second.  */
    1256     25728596 :   if (MEM_P (src2))
    1257              :     return false;
    1258     24296767 :   if (MEM_P (src1))
    1259              :     return true;
    1260              : 
    1261              :   return false;
    1262              : }
    1263              : 
    1264              : /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
    1265              :    destination to use for the operation.  If different from the true
    1266              :    destination in operands[0], a copy operation will be required except
    1267              :    under TARGET_APX_NDD.  */
    1268              : 
    1269              : rtx
    1270     13505221 : ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
    1271              :                             rtx operands[], bool use_ndd)
    1272              : {
    1273     13505221 :   rtx dst = operands[0];
    1274     13505221 :   rtx src1 = operands[1];
    1275     13505221 :   rtx src2 = operands[2];
    1276              : 
    1277              :   /* Canonicalize operand order.  */
    1278     13505221 :   if (ix86_swap_binary_operands_p (code, mode, operands))
    1279              :     {
    1280              :       /* It is invalid to swap operands of different modes.  */
    1281        88322 :       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
    1282              : 
    1283              :       std::swap (src1, src2);
    1284              :     }
    1285              : 
    1286              :   /* Both source operands cannot be in memory.  */
    1287     13505221 :   if (MEM_P (src1) && MEM_P (src2))
    1288              :     {
    1289              :       /* Optimization: Only read from memory once.  */
    1290       110575 :       if (rtx_equal_p (src1, src2))
    1291              :         {
    1292           17 :           src2 = force_reg (mode, src2);
    1293           17 :           src1 = src2;
    1294              :         }
    1295       110558 :       else if (rtx_equal_p (dst, src1))
    1296         3424 :         src2 = force_reg (mode, src2);
    1297              :       else
    1298       107134 :         src1 = force_reg (mode, src1);
    1299              :     }
    1300              : 
    1301              :   /* If the destination is memory, and we do not have matching source
    1302              :      operands, do things in registers.  */
    1303     13505221 :   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    1304       485024 :     dst = gen_reg_rtx (mode);
    1305              : 
    1306              :   /* Source 1 cannot be a constant.  */
    1307     13505221 :   if (CONSTANT_P (src1))
    1308          714 :     src1 = force_reg (mode, src1);
    1309              : 
    1310              :   /* Source 1 cannot be a non-matching memory.  */
    1311     13505221 :   if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    1312       438897 :     src1 = force_reg (mode, src1);
    1313              : 
    1314              :   /* Improve address combine.  */
    1315     13505221 :   if (code == PLUS
    1316      9928623 :       && GET_MODE_CLASS (mode) == MODE_INT
    1317      9817445 :       && MEM_P (src2))
    1318       177520 :     src2 = force_reg (mode, src2);
    1319              : 
    1320     13505221 :   operands[1] = src1;
    1321     13505221 :   operands[2] = src2;
    1322     13505221 :   return dst;
    1323              : }
    1324              : 
    1325              : /* Similarly, but assume that the destination has already been
    1326              :    set up properly.  */
    1327              : 
    1328              : void
    1329       294264 : ix86_fixup_binary_operands_no_copy (enum rtx_code code,
    1330              :                                     machine_mode mode, rtx operands[],
    1331              :                                     bool use_ndd)
    1332              : {
    1333       294264 :   rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
    1334       294264 :   gcc_assert (dst == operands[0]);
    1335       294264 : }
    1336              : 
    1337              : /* Attempt to expand a binary operator.  Make the expansion closer to the
    1338              :    actual machine, then just general_operand, which will allow 3 separate
    1339              :    memory references (one output, two input) in a single insn.  */
    1340              : 
    1341              : void
    1342     13210828 : ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
    1343              :                              rtx operands[], bool use_ndd)
    1344              : {
    1345     13210828 :   rtx src1, src2, dst, op, clob;
    1346              : 
    1347     13210828 :   dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
    1348     13210828 :   src1 = operands[1];
    1349     13210828 :   src2 = operands[2];
    1350              : 
    1351              :  /* Emit the instruction.  */
    1352              : 
    1353     13210828 :   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
    1354              : 
    1355     13210828 :   if (reload_completed
    1356        80480 :       && code == PLUS
    1357          908 :       && !rtx_equal_p (dst, src1)
    1358     13210828 :       && !use_ndd)
    1359              :     {
    1360              :       /* This is going to be an LEA; avoid splitting it later.  */
    1361            0 :       emit_insn (op);
    1362              :     }
    1363              :   else
    1364              :     {
    1365     13210828 :       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1366     13210828 :       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1367              :     }
    1368              : 
    1369              :   /* Fix up the destination if needed.  */
    1370     13210828 :   if (dst != operands[0])
    1371       485015 :     emit_move_insn (operands[0], dst);
    1372     13210828 : }
    1373              : 
    1374              : /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
    1375              :    the given OPERANDS.  */
    1376              : 
    1377              : void
    1378        83306 : ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
    1379              :                                      rtx operands[])
    1380              : {
    1381        83306 :   rtx op1 = NULL_RTX, op2 = NULL_RTX;
    1382        83306 :   if (SUBREG_P (operands[1]))
    1383              :     {
    1384          312 :       op1 = operands[1];
    1385          312 :       op2 = operands[2];
    1386              :     }
    1387        82994 :   else if (SUBREG_P (operands[2]))
    1388              :     {
    1389              :       op1 = operands[2];
    1390              :       op2 = operands[1];
    1391              :     }
    1392              :   /* Optimize (__m128i) d | (__m128i) e and similar code
    1393              :      when d and e are float vectors into float vector logical
    1394              :      insn.  In C/C++ without using intrinsics there is no other way
    1395              :      to express vector logical operation on float vectors than
    1396              :      to cast them temporarily to integer vectors.  */
    1397         3145 :   if (op1
    1398         3145 :       && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
    1399         3145 :       && (SUBREG_P (op2) || CONST_VECTOR_P (op2))
    1400          298 :       && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
    1401          303 :       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
    1402          101 :       && SUBREG_BYTE (op1) == 0
    1403          101 :       && (CONST_VECTOR_P (op2)
    1404            1 :           || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
    1405            1 :               && SUBREG_BYTE (op2) == 0))
    1406          101 :       && can_create_pseudo_p ())
    1407              :     {
    1408          101 :       rtx dst;
    1409          101 :       switch (GET_MODE (SUBREG_REG (op1)))
    1410              :         {
    1411           17 :         case E_V4SFmode:
    1412           17 :         case E_V8SFmode:
    1413           17 :         case E_V16SFmode:
    1414           17 :         case E_V2DFmode:
    1415           17 :         case E_V4DFmode:
    1416           17 :         case E_V8DFmode:
    1417           17 :           dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
    1418           17 :           if (CONST_VECTOR_P (op2))
    1419              :             {
    1420           16 :               op2 = gen_lowpart (GET_MODE (dst), op2);
    1421           16 :               op2 = force_reg (GET_MODE (dst), op2);
    1422              :             }
    1423              :           else
    1424              :             {
    1425            1 :               op1 = operands[1];
    1426            1 :               op2 = SUBREG_REG (operands[2]);
    1427            1 :               if (!vector_operand (op2, GET_MODE (dst)))
    1428            0 :                 op2 = force_reg (GET_MODE (dst), op2);
    1429              :             }
    1430           17 :           op1 = SUBREG_REG (op1);
    1431           17 :           if (!vector_operand (op1, GET_MODE (dst)))
    1432            0 :             op1 = force_reg (GET_MODE (dst), op1);
    1433           17 :           emit_insn (gen_rtx_SET (dst,
    1434              :                                   gen_rtx_fmt_ee (code, GET_MODE (dst),
    1435              :                                                   op1, op2)));
    1436           17 :           emit_move_insn (operands[0], gen_lowpart (mode, dst));
    1437           17 :           return;
    1438              :         default:
    1439              :           break;
    1440              :         }
    1441              :     }
    1442        83289 :   if (!vector_operand (operands[1], mode))
    1443            0 :     operands[1] = force_reg (mode, operands[1]);
    1444        83289 :   if (!vector_operand (operands[2], mode))
    1445        11277 :     operands[2] = force_reg (mode, operands[2]);
    1446        83289 :   ix86_fixup_binary_operands_no_copy (code, mode, operands);
    1447        83289 :   emit_insn (gen_rtx_SET (operands[0],
    1448              :                           gen_rtx_fmt_ee (code, mode, operands[1],
    1449              :                                           operands[2])));
    1450              : }
    1451              : 
    1452              : /* Return TRUE or FALSE depending on whether the binary operator meets the
    1453              :    appropriate constraints.  */
    1454              : 
    1455              : bool
    1456    161606949 : ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
    1457              :                          rtx operands[3], bool use_ndd)
    1458              : {
    1459    161606949 :   rtx dst = operands[0];
    1460    161606949 :   rtx src1 = operands[1];
    1461    161606949 :   rtx src2 = operands[2];
    1462              : 
    1463              :   /* Both source operands cannot be in memory.  */
    1464    154249898 :   if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
    1465    161607334 :       && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    1466       998785 :     return false;
    1467              : 
    1468              :   /* Canonicalize operand order for commutative operators.  */
    1469    160608164 :   if (ix86_swap_binary_operands_p (code, mode, operands))
    1470       534042 :     std::swap (src1, src2);
    1471              : 
    1472              :   /* If the destination is memory, we must have a matching source operand.  */
    1473    160608164 :   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    1474              :     return false;
    1475              : 
    1476              :   /* Source 1 cannot be a constant.  */
    1477    155574993 :   if (CONSTANT_P (src1))
    1478              :     return false;
    1479              : 
    1480              :   /* Source 1 cannot be a non-matching memory.  */
    1481    155571944 :   if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    1482              :     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    1483      4405557 :     return (code == AND
    1484       510996 :             && (mode == HImode
    1485       510996 :                 || mode == SImode
    1486       306319 :                 || (TARGET_64BIT && mode == DImode))
    1487      4705379 :             && satisfies_constraint_L (src2));
    1488              : 
    1489              :   return true;
    1490              : }
    1491              : 
    1492              : /* Attempt to expand a unary operator.  Make the expansion closer to the
    1493              :    actual machine, then just general_operand, which will allow 2 separate
    1494              :    memory references (one output, one input) in a single insn.  */
    1495              : 
    1496              : void
    1497       120568 : ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
    1498              :                             rtx operands[], bool use_ndd)
    1499              : {
    1500       120568 :   bool matching_memory = false;
    1501       120568 :   rtx src, dst, op, clob;
    1502              : 
    1503       120568 :   dst = operands[0];
    1504       120568 :   src = operands[1];
    1505              : 
    1506              :   /* If the destination is memory, and we do not have matching source
    1507              :      operands, do things in registers.  */
    1508       120568 :   if (MEM_P (dst))
    1509              :     {
    1510         3350 :       if (rtx_equal_p (dst, src))
    1511              :         matching_memory = true;
    1512              :       else
    1513         3034 :         dst = gen_reg_rtx (mode);
    1514              :     }
    1515              : 
    1516              :   /* When source operand is memory, destination must match.  */
    1517       120568 :   if (!use_ndd && MEM_P (src) && !matching_memory)
    1518         4661 :     src = force_reg (mode, src);
    1519              : 
    1520              :   /* Emit the instruction.  */
    1521              : 
    1522       120568 :   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
    1523              : 
    1524       120568 :   if (code == NOT)
    1525        69733 :     emit_insn (op);
    1526              :   else
    1527              :     {
    1528        50835 :       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1529        50835 :       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1530              :     }
    1531              : 
    1532              :   /* Fix up the destination if needed.  */
    1533       120568 :   if (dst != operands[0])
    1534         3034 :     emit_move_insn (operands[0], dst);
    1535       120568 : }
    1536              : 
    1537              : /* Return TRUE or FALSE depending on whether the unary operator meets the
    1538              :    appropriate constraints.  */
    1539              : 
    1540              : bool
    1541      1754446 : ix86_unary_operator_ok (enum rtx_code,
    1542              :                         machine_mode,
    1543              :                         rtx operands[2],
    1544              :                         bool use_ndd)
    1545              : {
    1546              :   /* If one of operands is memory, source and destination must match.  */
    1547      1754446 :   if ((MEM_P (operands[0])
    1548      1709677 :        || (!use_ndd && MEM_P (operands[1])))
    1549      1783295 :       && ! rtx_equal_p (operands[0], operands[1]))
    1550              :     return false;
    1551              :   return true;
    1552              : }
    1553              : 
    1554              : /* Predict just emitted jump instruction to be taken with probability PROB.  */
    1555              : 
    1556              : static void
    1557        66402 : predict_jump (int prob)
    1558              : {
    1559        66402 :   rtx_insn *insn = get_last_insn ();
    1560        66402 :   gcc_assert (JUMP_P (insn));
    1561        66402 :   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
    1562        66402 : }
    1563              : 
    1564              : /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
    1565              :    divisor are within the range [0-255].  */
    1566              : 
    1567              : void
    1568           27 : ix86_split_idivmod (machine_mode mode, rtx operands[],
    1569              :                     bool unsigned_p)
    1570              : {
    1571           27 :   rtx_code_label *end_label, *qimode_label;
    1572           27 :   rtx div, mod;
    1573           27 :   rtx_insn *insn;
    1574           27 :   rtx scratch, tmp0, tmp1, tmp2;
    1575           27 :   rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
    1576              : 
    1577           27 :   operands[2] = force_reg (mode, operands[2]);
    1578           27 :   operands[3] = force_reg (mode, operands[3]);
    1579              : 
    1580           27 :   switch (mode)
    1581              :     {
    1582           20 :     case E_SImode:
    1583           20 :       if (GET_MODE (operands[0]) == SImode)
    1584              :         {
    1585           16 :           if (GET_MODE (operands[1]) == SImode)
    1586           14 :             gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
    1587              :           else
    1588            2 :             gen_divmod4_1
    1589            2 :               = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
    1590              :         }
    1591              :       else
    1592            4 :         gen_divmod4_1
    1593            4 :           = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
    1594              :       break;
    1595              : 
    1596            7 :     case E_DImode:
    1597            7 :       gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
    1598              :       break;
    1599              : 
    1600            0 :     default:
    1601            0 :       gcc_unreachable ();
    1602              :     }
    1603              : 
    1604           27 :   end_label = gen_label_rtx ();
    1605           27 :   qimode_label = gen_label_rtx ();
    1606              : 
    1607           27 :   scratch = gen_reg_rtx (mode);
    1608              : 
    1609              :   /* Use 8bit unsigned divimod if dividend and divisor are within
    1610              :      the range [0-255].  */
    1611           27 :   emit_move_insn (scratch, operands[2]);
    1612           27 :   scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
    1613              :                                  scratch, 1, OPTAB_DIRECT);
    1614           27 :   emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
    1615           27 :   tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
    1616           27 :   tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
    1617           27 :   tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
    1618              :                                gen_rtx_LABEL_REF (VOIDmode, qimode_label),
    1619              :                                pc_rtx);
    1620           27 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
    1621           27 :   predict_jump (REG_BR_PROB_BASE * 50 / 100);
    1622           27 :   JUMP_LABEL (insn) = qimode_label;
    1623              : 
    1624              :   /* Generate original signed/unsigned divimod.  */
    1625           27 :   emit_insn (gen_divmod4_1 (operands[0], operands[1],
    1626              :                             operands[2], operands[3]));
    1627              : 
    1628              :   /* Branch to the end.  */
    1629           27 :   emit_jump_insn (gen_jump (end_label));
    1630           27 :   emit_barrier ();
    1631              : 
    1632              :   /* Generate 8bit unsigned divide.  */
    1633           27 :   emit_label (qimode_label);
    1634              :   /* Don't use operands[0] for result of 8bit divide since not all
    1635              :      registers support QImode ZERO_EXTRACT.  */
    1636           27 :   tmp0 = lowpart_subreg (HImode, scratch, mode);
    1637           27 :   tmp1 = lowpart_subreg (HImode, operands[2], mode);
    1638           27 :   tmp2 = lowpart_subreg (QImode, operands[3], mode);
    1639           27 :   emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
    1640              : 
    1641           27 :   if (unsigned_p)
    1642              :     {
    1643           12 :       div = gen_rtx_UDIV (mode, operands[2], operands[3]);
    1644           12 :       mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    1645              :     }
    1646              :   else
    1647              :     {
    1648           15 :       div = gen_rtx_DIV (mode, operands[2], operands[3]);
    1649           15 :       mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    1650              :     }
    1651           27 :   if (mode == SImode)
    1652              :     {
    1653           20 :       if (GET_MODE (operands[0]) != SImode)
    1654            4 :         div = gen_rtx_ZERO_EXTEND (DImode, div);
    1655           20 :       if (GET_MODE (operands[1]) != SImode)
    1656            2 :         mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    1657              :     }
    1658              : 
    1659              :   /* Extract remainder from AH.  */
    1660           27 :   scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
    1661           27 :   tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
    1662              :                                GEN_INT (8), GEN_INT (8));
    1663           27 :   insn = emit_move_insn (operands[1], tmp1);
    1664           27 :   set_unique_reg_note (insn, REG_EQUAL, mod);
    1665              : 
    1666              :   /* Zero extend quotient from AL.  */
    1667           27 :   tmp1 = gen_lowpart (QImode, tmp0);
    1668           27 :   insn = emit_insn (gen_extend_insn
    1669           27 :                     (operands[0], tmp1,
    1670           27 :                      GET_MODE (operands[0]), QImode, 1));
    1671           27 :   set_unique_reg_note (insn, REG_EQUAL, div);
    1672              : 
    1673           27 :   emit_label (end_label);
    1674           27 : }
    1675              : 
    1676              : /* Emit x86 binary operand CODE in mode MODE, where the first operand
    1677              :    matches destination.  RTX includes clobber of FLAGS_REG.  */
    1678              : 
    1679              : void
    1680         7734 : ix86_emit_binop (enum rtx_code code, machine_mode mode,
    1681              :                  rtx dst, rtx src)
    1682              : {
    1683         7734 :   rtx op, clob;
    1684              : 
    1685         7734 :   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
    1686         7734 :   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1687              : 
    1688         7734 :   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1689         7734 : }
    1690              : 
    1691              : /* Return true if regno1 def is nearest to the insn.  */
    1692              : 
    1693              : static bool
    1694           15 : find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
    1695              : {
    1696           15 :   rtx_insn *prev = insn;
    1697           15 :   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
    1698              : 
    1699           15 :   if (insn == start)
    1700              :     return false;
    1701           40 :   while (prev && prev != start)
    1702              :     {
    1703           30 :       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
    1704              :         {
    1705           10 :           prev = PREV_INSN (prev);
    1706           10 :           continue;
    1707              :         }
    1708           20 :       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
    1709              :         return true;
    1710           15 :       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
    1711              :         return false;
    1712           15 :       prev = PREV_INSN (prev);
    1713              :     }
    1714              : 
    1715              :   /* None of the regs is defined in the bb.  */
    1716              :   return false;
    1717              : }
    1718              : 
    1719              : /* INSN_UID of the last insn emitted by zero store peephole2s.
                           Defined without an initializer at file scope, so it starts
                           out as zero.  It has external linkage; the code that reads
                           and updates it is not among the functions defined next to
                           this declaration.  */
    1720              : int ix86_last_zero_store_uid;
    1721              : 
    1722              : /* Split lea instructions into a sequence of instructions
    1723              :    which are executed on ALU to avoid AGU stalls.
    1724              :    It is assumed that it is allowed to clobber flags register
    1725              :    at lea position.
                      :    OPERANDS[0] is the destination and OPERANDS[1] the address,
                      :    which ix86_decompose_address must accept (asserted below).
                      :    The destination, base, index and displacement are all taken
                      :    in MODE through gen_lowpart.  INSN is used only to choose,
                      :    through find_nearest_reg_def, which of base and index is
                      :    copied into the destination first when the destination
                      :    register matches neither of them.  */
    1726              : 
    1727              : void
    1728         5915 : ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
    1729              : {
    1730         5915 :   unsigned int regno0, regno1, regno2;
    1731         5915 :   struct ix86_address parts;
    1732         5915 :   rtx target, tmp;
    1733         5915 :   int ok, adds;
    1734              : 
    1735         5915 :   ok = ix86_decompose_address (operands[1], &parts);
    1736         5915 :   gcc_assert (ok);
    1737              : 
    1738         5915 :   target = gen_lowpart (mode, operands[0]);
    1739              : 
    1740         5915 :   regno0 = true_regnum (target);
    1741         5915 :   regno1 = INVALID_REGNUM;
    1742         5915 :   regno2 = INVALID_REGNUM;
    1743              : 
    1744         5915 :   if (parts.base)
    1745              :     {
    1746         5907 :       parts.base = gen_lowpart (mode, parts.base);
    1747         5907 :       regno1 = true_regnum (parts.base);
    1748              :     }
    1749              : 
    1750         5915 :   if (parts.index)
    1751              :     {
    1752         5912 :       parts.index = gen_lowpart (mode, parts.index);
    1753         5912 :       regno2 = true_regnum (parts.index);
    1754              :     }
    1755              : 
    1756         5915 :   if (parts.disp)
    1757          190 :     parts.disp = gen_lowpart (mode, parts.disp);
    1758              : 
    1759         5915 :   if (parts.scale > 1)
    1760              :     {
    1761              :       /* Case r1 = r1 + ...  */
    1762           11 :       if (regno1 == regno0)
    1763              :         {
    1764              :           /* If we have a case r1 = r1 + C * r2 then we
    1765              :              should use multiplication which is very
    1766              :              expensive.  Assume cost model is wrong if we
    1767              :              have such case here.
                      :              NOTE(review): the loop below emits parts.scale
                      :              additions of the index, and this branch never adds
                      :              parts.disp -- confirm a displacement cannot reach
                      :              this path.  */
    1768            0 :           gcc_assert (regno2 != regno0);
    1769              : 
    1770            0 :           for (adds = parts.scale; adds > 0; adds--)
    1771            0 :             ix86_emit_binop (PLUS, mode, target, parts.index);
    1772              :         }
    1773              :       else
    1774              :         {
    1775              :           /* r1 = r2 + r3 * C case.  Need to move r3 into r1
                      :              (skipped when r3 already is r1).  */
    1776           11 :           if (regno0 != regno2)
    1777            8 :             emit_insn (gen_rtx_SET (target, parts.index));
    1778              : 
    1779              :           /* Use shift for scaling, but emit it as MULT instead
    1780              :              to avoid it being immediately peephole2 optimized back
    1781              :              into lea.  */
    1782           11 :           ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
    1783              : 
    1784           11 :           if (parts.base)
    1785            3 :             ix86_emit_binop (PLUS, mode, target, parts.base);
    1786              : 
    1787           11 :           if (parts.disp && parts.disp != const0_rtx)
    1788            3 :             ix86_emit_binop (PLUS, mode, target, parts.disp);
    1789              :         }
    1790              :     }
    1791         5904 :   else if (!parts.base && !parts.index)
    1792              :     {
    1793            0 :       gcc_assert(parts.disp);
    1794            0 :       emit_insn (gen_rtx_SET (target, parts.disp));
    1795              :     }
    1796              :   else
    1797              :     {
    1798         5904 :       if (!parts.base)
    1799              :         {
    1800            0 :           if (regno0 != regno2)
    1801            0 :             emit_insn (gen_rtx_SET (target, parts.index));
    1802              :         }
    1803         5904 :       else if (!parts.index)
    1804              :         {
    1805            3 :           if (regno0 != regno1)
    1806            1 :             emit_insn (gen_rtx_SET (target, parts.base));
    1807              :         }
    1808              :       else
    1809              :         {
    1810         5901 :           if (regno0 == regno1)
    1811              :             tmp = parts.index;
    1812         2972 :           else if (regno0 == regno2)
    1813              :             tmp = parts.base;
    1814              :           else
    1815              :             {
    1816           15 :               rtx tmp1;
    1817              : 
    1818              :               /* Find better operand for SET instruction, depending
    1819              :                  on which definition is farther from the insn.
                      :                  The remaining register is added last, after any
                      :                  displacement other than const0_rtx.  */
    1820           15 :               if (find_nearest_reg_def (insn, regno1, regno2))
    1821            5 :                 tmp = parts.index, tmp1 = parts.base;
    1822              :               else
    1823           10 :                 tmp = parts.base, tmp1 = parts.index;
    1824              : 
    1825           15 :               emit_insn (gen_rtx_SET (target, tmp));
    1826              : 
    1827           15 :               if (parts.disp && parts.disp != const0_rtx)
    1828            0 :                 ix86_emit_binop (PLUS, mode, target, parts.disp);
    1829              : 
    1830           15 :               ix86_emit_binop (PLUS, mode, target, tmp1);
    1831           15 :               return;
    1832              :             }
    1833              : 
    1834         5886 :           ix86_emit_binop (PLUS, mode, target, tmp);
    1835              :         }
    1836              : 
    1837         5889 :       if (parts.disp && parts.disp != const0_rtx)
    1838            4 :         ix86_emit_binop (PLUS, mode, target, parts.disp);
    1839              :     }
    1840              : }
    1841              : 
    1842              : /* Post-reload splitter for converting an SF or DFmode value in an
    1843              :    SSE register into an unsigned SImode.
                      :    OPERANDS[0] is the result register, reused here in the vector
                      :    mode of OPERANDS[1]; OPERANDS[1] (LARGE) and OPERANDS[2]
                      :    (ZERO_OR_TWO31) are written as temporaries; OPERANDS[3] is
                      :    the input, in memory or in a register; OPERANDS[4] is TWO31.
                      :    LARGE becomes the LE comparison of TWO31 against the value,
                      :    ZERO_OR_TWO31 is ANDed with it and subtracted from the value,
                      :    and LARGE, passed through ashlv4si3 with count 31, is XORed
                      :    into the truncated V4SImode result.  */
    1844              : 
    1845              : void
    1846            0 : ix86_split_convert_uns_si_sse (rtx operands[])
    1847              : {
    1848            0 :   machine_mode vecmode;
    1849            0 :   rtx value, large, zero_or_two31, input, two31, x;
    1850              : 
    1851            0 :   large = operands[1];
    1852            0 :   zero_or_two31 = operands[2];
    1853            0 :   input = operands[3];
    1854            0 :   two31 = operands[4];
    1855            0 :   vecmode = GET_MODE (large);
    1856            0 :   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
    1857              : 
    1858              :   /* Load up the value into the low element.  We must ensure that the other
    1859              :      elements are valid floats -- zero is the easiest such value.  */
    1860            0 :   if (MEM_P (input))
    1861              :     {
    1862            0 :       if (vecmode == V4SFmode)
    1863            0 :         emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
    1864              :       else
    1865            0 :         emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    1866              :     }
    1867              :   else
    1868              :     {
    1869            0 :       input = gen_rtx_REG (vecmode, REGNO (input));
    1870            0 :       emit_move_insn (value, CONST0_RTX (vecmode));
    1871            0 :       if (vecmode == V4SFmode)
    1872            0 :         emit_insn (gen_sse_movss_v4sf (value, value, input));
    1873              :       else
    1874            0 :         emit_insn (gen_sse2_movsd_v2df (value, value, input));
    1875              :     }
    1876              : 
    1877            0 :   emit_move_insn (large, two31);
    1878            0 :   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
    1879              : 
    1880            0 :   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
    1881            0 :   emit_insn (gen_rtx_SET (large, x));
    1882              : 
    1883            0 :   x = gen_rtx_AND (vecmode, zero_or_two31, large);
    1884            0 :   emit_insn (gen_rtx_SET (zero_or_two31, x));
    1885              : 
    1886            0 :   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
    1887            0 :   emit_insn (gen_rtx_SET (value, x));
    1888              : 
    1889            0 :   large = gen_rtx_REG (V4SImode, REGNO (large));
    1890            0 :   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
    1891              : 
    1892            0 :   x = gen_rtx_REG (V4SImode, REGNO (value));
    1893            0 :   if (vecmode == V4SFmode)
    1894            0 :     emit_insn (gen_fix_truncv4sfv4si2 (x, value));
    1895              :   else
    1896            0 :     emit_insn (gen_sse2_cvttpd2dq (x, value));
    1897            0 :   value = x;
    1898              : 
    1899            0 :   emit_insn (gen_xorv4si3 (value, value, large));
    1900            0 : }
    1901              : 
    1902              : /* Convert an unsigned DImode value into a DFmode, using only SSE.
    1903              :    Expects the 64-bit DImode to be supplied in a pair of integral
    1904              :    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
    1905              :    -mfpmath=sse, !optimize_size only.
                      :    INPUT is first moved into a fresh V4SImode register by a
                      :    method chosen from TARGET_INTER_UNIT_MOVES_TO_VEC and
                      :    TARGET_SSE_SPLIT_REGS.  TARGET receives element 0 of the
                      :    final V2DFmode sum via ix86_expand_vector_extract.  */
    1906              : 
    1907              : void
    1908            0 : ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
    1909              : {
    1910            0 :   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
    1911            0 :   rtx int_xmm, fp_xmm;
    1912            0 :   rtx biases, exponents;
    1913            0 :   rtx x;
    1914              : 
    1915            0 :   int_xmm = gen_reg_rtx (V4SImode);
    1916            0 :   if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    1917            0 :     emit_insn (gen_movdi_to_sse (int_xmm, input));
    1918            0 :   else if (TARGET_SSE_SPLIT_REGS)
    1919              :     {
    1920            0 :       emit_clobber (int_xmm);
    1921            0 :       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    1922              :     }
    1923              :   else
    1924              :     {
    1925            0 :       x = gen_reg_rtx (V2DImode);
    1926            0 :       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
    1927            0 :       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    1928              :     }
    1929              : 
    1930            0 :   x = gen_rtx_CONST_VECTOR (V4SImode,
    1931              :                             gen_rtvec (4, GEN_INT (0x43300000UL),
    1932              :                                        GEN_INT (0x45300000UL),
    1933              :                                        const0_rtx, const0_rtx));
    1934            0 :   exponents = validize_mem (force_const_mem (V4SImode, x));
    1935              : 
    1936              :   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
    1937            0 :   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
    1938              : 
    1939              :   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
    1940              :      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
    1941              :      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
    1942              :      (0x1.0p84 + double(fp_value_hi_xmm)).
    1943              :      Note these exponents differ by 32.  */
    1944              : 
    1945            0 :   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
    1946              : 
    1947              :   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
    1948              :      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
    1949            0 :   real_ldexp (&bias_lo_rvt, &dconst1, 52);
    1950            0 :   real_ldexp (&bias_hi_rvt, &dconst1, 84);
    1951            0 :   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
    1952            0 :   x = const_double_from_real_value (bias_hi_rvt, DFmode);
    1953            0 :   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
    1954            0 :   biases = validize_mem (force_const_mem (V2DFmode, biases));
    1955            0 :   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
    1956              : 
    1957              :   /* Add the upper and lower DFmode values together.  */
    1958            0 :   if (TARGET_SSE3)
    1959            0 :     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
    1960              :   else
    1961              :     {
    1962            0 :       x = copy_to_mode_reg (V2DFmode, fp_xmm);
    1963            0 :       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
    1964            0 :       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    1965              :     }
    1966              : 
    1967            0 :   ix86_expand_vector_extract (false, target, fp_xmm, 0);
    1968            0 : }
    1969              : 
    1970              : /* Not used, but eases macroization of patterns.  Both arguments
                      :    are unnamed and ignored; the body is just gcc_unreachable.  */
    1971              : void
    1972            0 : ix86_expand_convert_uns_sixf_sse (rtx, rtx)
    1973              : {
    1974            0 :   gcc_unreachable ();
    1975              : }
    1976              : 
    1977              : static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
    1978              : 
    1979              : /* Convert an unsigned SImode value into a DFmode.  Only currently used
    1980              :    for SSE, but applicable anywhere.
                      :    INPUT has -2**31 added to it in SImode, is converted with the
                      :    signed floatsidf2 pattern, and 0x1p31 is then added back in
                      :    DFmode, with TARGET suggested as the place for that sum.  A
                      :    final move into TARGET is emitted only when the result ended
                      :    up somewhere else.  */
    1981              : 
    1982              : void
    1983            0 : ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
    1984              : {
    1985            0 :   REAL_VALUE_TYPE TWO31r;
    1986            0 :   rtx x, fp;
    1987              : 
    1988            0 :   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
    1989              :                            NULL, 1, OPTAB_DIRECT);
    1990              : 
    1991            0 :   fp = gen_reg_rtx (DFmode);
    1992            0 :   emit_insn (gen_floatsidf2 (fp, x));
    1993              : 
    1994            0 :   real_ldexp (&TWO31r, &dconst1, 31);
    1995            0 :   x = const_double_from_real_value (TWO31r, DFmode);
    1996              : 
    1997            0 :   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
    1998              : 
    1999              :   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
    2000            0 :   if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    2001            0 :     x = ix86_expand_sse_fabs (x, NULL);
    2002              : 
    2003            0 :   if (x != target)
    2004            0 :     emit_move_insn (target, x);
    2005            0 : }
    2006              : 
    2007              : /* Convert a signed DImode value into a DFmode.  Only used for SSE in
    2008              :    32-bit mode; otherwise we have a direct convert instruction.
                      :    The SImode high part of INPUT is converted with floatsidf2 and
                      :    multiplied by 0x1p32; the SImode low part is converted as an
                      :    unsigned value by ix86_expand_convert_uns_sidf_sse; the two
                      :    DFmode values are then added, with TARGET suggested as the
                      :    destination and an explicit move emitted if it was not used.  */
    2009              : 
    2010              : void
    2011            0 : ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
    2012              : {
    2013            0 :   REAL_VALUE_TYPE TWO32r;
    2014            0 :   rtx fp_lo, fp_hi, x;
    2015              : 
    2016            0 :   fp_lo = gen_reg_rtx (DFmode);
    2017            0 :   fp_hi = gen_reg_rtx (DFmode);
    2018              : 
    2019            0 :   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
    2020              : 
    2021            0 :   real_ldexp (&TWO32r, &dconst1, 32);
    2022            0 :   x = const_double_from_real_value (TWO32r, DFmode);
    2023            0 :   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
    2024              : 
    2025            0 :   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
    2026              : 
    2027            0 :   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
    2028              :                            0, OPTAB_DIRECT);
    2029            0 :   if (x != target)
    2030            0 :     emit_move_insn (target, x);
    2031            0 : }
    2032              : 
    2033              : /* Convert an unsigned SImode value into a SFmode, using only SSE.
    2034              :    For x86_32, -mfpmath=sse, !optimize_size only.
                      :    INPUT is split into its low 16 bits (AND with 0xffff) and its
                      :    value shifted right logically by 16.  Each half is converted
                      :    with floatsisf2 and TARGET is set to high * 0x1p16 + low: as
                      :    a single FMA, with the constant forced to memory, when
                      :    TARGET_FMA, and as MULT followed by PLUS otherwise.  */
    2035              : void
    2036            0 : ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
    2037              : {
    2038            0 :   REAL_VALUE_TYPE ONE16r;
    2039            0 :   rtx fp_hi, fp_lo, int_hi, int_lo, x;
    2040              : 
    2041            0 :   real_ldexp (&ONE16r, &dconst1, 16);
    2042            0 :   x = const_double_from_real_value (ONE16r, SFmode);
    2043            0 :   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
    2044              :                                       NULL, 0, OPTAB_DIRECT);
    2045            0 :   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
    2046              :                                       NULL, 0, OPTAB_DIRECT);
    2047            0 :   fp_hi = gen_reg_rtx (SFmode);
    2048            0 :   fp_lo = gen_reg_rtx (SFmode);
    2049            0 :   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
    2050            0 :   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
    2051            0 :   if (TARGET_FMA)
    2052              :     {
    2053            0 :       x = validize_mem (force_const_mem (SFmode, x));
    2054            0 :       fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
    2055            0 :       emit_move_insn (target, fp_hi);
    2056              :     }
    2057              :   else
    2058              :     {
    2059            0 :       fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
    2060              :                                    0, OPTAB_DIRECT);
    2061            0 :       fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
    2062              :                                    0, OPTAB_DIRECT);
    2063            0 :       if (!rtx_equal_p (target, fp_hi))
    2064            0 :         emit_move_insn (target, fp_hi);
    2065              :     }
    2066            0 : }
    2067              : 
    2068              : /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
    2069              :    a vector of unsigned ints VAL to vector of floats TARGET.
                      :    VAL is split into its low 16 bits and its value shifted right
                      :    logically by 16; both halves go through the signed conversion
                      :    pattern selected from VAL's mode, and TARGET is set to
                      :    high * 0x1p16 + low, as one FMA when TARGET_FMA and as MULT
                      :    followed by PLUS otherwise.  */
    2070              : 
    2071              : void
    2072           54 : ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
    2073              : {
    2074           54 :   rtx tmp[8];
    2075           54 :   REAL_VALUE_TYPE TWO16r;
    2076           54 :   machine_mode intmode = GET_MODE (val);
    2077           54 :   machine_mode fltmode = GET_MODE (target);
    2078           54 :   rtx (*cvt) (rtx, rtx);
    2079              : 
    2080           54 :   if (intmode == V4SImode)
    2081              :     cvt = gen_floatv4siv4sf2;
    2082              :   else
    2083            2 :     cvt = gen_floatv8siv8sf2;
    2084           54 :   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
    2085           54 :   tmp[0] = force_reg (intmode, tmp[0]);
    2086           54 :   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
    2087              :                                 OPTAB_DIRECT);
    2088           54 :   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
    2089              :                                 NULL_RTX, 1, OPTAB_DIRECT);
    2090           54 :   tmp[3] = gen_reg_rtx (fltmode);
    2091           54 :   emit_insn (cvt (tmp[3], tmp[1]));
    2092           54 :   tmp[4] = gen_reg_rtx (fltmode);
    2093           54 :   emit_insn (cvt (tmp[4], tmp[2]));
    2094           54 :   real_ldexp (&TWO16r, &dconst1, 16);
    2095           54 :   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
    2096           54 :   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
    2097           54 :   if (TARGET_FMA)
    2098              :     {
    2099            1 :       tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
    2100            1 :       emit_move_insn (target, tmp[6]);
    2101              :     }
    2102              :   else
    2103              :     {
    2104           53 :       tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
    2105              :                                     NULL_RTX, 1, OPTAB_DIRECT);
    2106           53 :       tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
    2107              :                                     target, 1, OPTAB_DIRECT);
    2108           53 :       if (tmp[7] != target)
    2109            0 :         emit_move_insn (target, tmp[7]);
    2110              :     }
    2111           54 : }
    2112              : 
    2113              : /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
    2114              :    pattern can be used on it instead of fixuns_trunc*.
    2115              :    This is done by doing just signed conversion if < 0x1p31, and otherwise by
    2116              :    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.
                      :    The return value is the adjusted VAL; *XORP is set to the
                      :    integer vector to be xored into the converted result.
                      :    Only V8SF, V4SF, V4DF and V2DF modes are handled.  When the
                      :    integer mode is V8SImode and TARGET_AVX2 is not set, *XORP is
                      :    built by ANDing the comparison mask with a 0x80000000 vector
                      :    rather than by shifting it left by 31.  */
    2117              : 
    2118              : rtx
    2119          286 : ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
    2120              : {
    2121          286 :   REAL_VALUE_TYPE TWO31r;
    2122          286 :   rtx two31r, tmp[4];
    2123          286 :   machine_mode mode = GET_MODE (val);
    2124          286 :   machine_mode scalarmode = GET_MODE_INNER (mode);
    2125          572 :   machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
    2126          286 :   rtx (*cmp) (rtx, rtx, rtx, rtx);
    2127          286 :   int i;
    2128              : 
    2129         1144 :   for (i = 0; i < 3; i++)
    2130          858 :     tmp[i] = gen_reg_rtx (mode);
    2131          286 :   real_ldexp (&TWO31r, &dconst1, 31);
    2132          286 :   two31r = const_double_from_real_value (TWO31r, scalarmode);
    2133          286 :   two31r = ix86_build_const_vector (mode, 1, two31r);
    2134          286 :   two31r = force_reg (mode, two31r);
    2135          286 :   switch (mode)
    2136              :     {
    2137              :     case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    2138           10 :     case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    2139           16 :     case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    2140          260 :     case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    2141            0 :     default: gcc_unreachable ();
    2142              :     }
    2143          286 :   tmp[3] = gen_rtx_LE (mode, two31r, val);
    2144          286 :   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
    2145          286 :   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
    2146              :                                 0, OPTAB_DIRECT);
    2147          286 :   if (intmode == V4SImode || TARGET_AVX2)
    2148          572 :     *xorp = expand_simple_binop (intmode, ASHIFT,
    2149          286 :                                  gen_lowpart (intmode, tmp[0]),
    2150              :                                  GEN_INT (31), NULL_RTX, 0,
    2151              :                                  OPTAB_DIRECT);
    2152              :   else
    2153              :     {
    2154            0 :       rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
    2155            0 :       two31 = ix86_build_const_vector (intmode, 1, two31);
    2156            0 :       *xorp = expand_simple_binop (intmode, AND,
    2157            0 :                                    gen_lowpart (intmode, tmp[0]),
    2158              :                                    two31, NULL_RTX, 0,
    2159              :                                    OPTAB_DIRECT);
    2160              :     }
    2161          286 :   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
    2162          286 :                               0, OPTAB_DIRECT);
    2163              : }
    2164              : 
    2165              : /* Generate code for floating point ABS or NEG.
                      :    CODE is applied to OPERANDS[1] in MODE and the result is set
                      :    into OPERANDS[0].  The SET is emitted inside a PARALLEL.  On
                      :    the SSE path it is accompanied by a USE of the sign-bit mask
                      :    and, for modes that are neither vector modes nor TFmode, by a
                      :    clobber of FLAGS_REG; otherwise only by the FLAGS_REG
                      :    clobber.  */
    2166              : 
    2167              : void
    2168        32875 : ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
    2169              :                                 rtx operands[])
    2170              : {
    2171        32875 :   rtx set, dst, src;
    2172        32875 :   bool use_sse = false;
    2173        32875 :   bool vector_mode = VECTOR_MODE_P (mode);
    2174        32875 :   machine_mode vmode = mode;
    2175        32875 :   rtvec par;
    2176              : 
    2177        32875 :   switch (mode)
    2178              :   {
    2179              :   case E_HFmode:
    2180              :     use_sse = true;
    2181              :     vmode = V8HFmode;
    2182              :     break;
    2183            0 :   case E_BFmode:
    2184            0 :     use_sse = true;
    2185            0 :     vmode = V8BFmode;
    2186            0 :     break;
    2187         8986 :   case E_SFmode:
    2188         8986 :     use_sse = TARGET_SSE_MATH && TARGET_SSE;
    2189              :     vmode = V4SFmode;
    2190              :     break;
    2191        15427 :   case E_DFmode:
    2192        15427 :     use_sse = TARGET_SSE_MATH && TARGET_SSE2;
    2193              :     vmode = V2DFmode;
    2194              :     break;
    2195         8263 :   default:
    2196         8263 :     use_sse = vector_mode || mode == TFmode;
    2197         8263 :     break;
    2198              :   }
    2199              : 
    2200        32875 :   dst = operands[0];
    2201        32875 :   src = operands[1];
    2202              : 
    2203        32875 :   set = gen_rtx_fmt_e (code, mode, src);
    2204        32875 :   set = gen_rtx_SET (dst, set);
    2205              : 
    2206        32875 :   if (use_sse)
    2207              :     {
    2208        27702 :       rtx mask, use, clob;
    2209              : 
    2210              :       /* NEG and ABS performed with SSE use bitwise mask operations.
    2211              :          Create the appropriate mask now.  */
    2212        27702 :       mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
    2213        27702 :       use = gen_rtx_USE (VOIDmode, mask);
    2214        27702 :       if (vector_mode || mode == TFmode)
    2215         4411 :         par = gen_rtvec (2, set, use);
    2216              :       else
    2217              :         {
    2218        23291 :           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    2219        23291 :           par = gen_rtvec (3, set, use, clob);
    2220              :         }
    2221              :     }
    2222              :   else
    2223              :     {
    2224         5173 :       rtx clob;
    2225              : 
    2226              :       /* Changing of sign for FP values is doable using integer unit too.  */
    2227         5173 :       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    2228         5173 :       par = gen_rtvec (2, set, clob);
    2229              :     }
    2230              : 
    2231        32875 :   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
    2232        32875 : }
    2233              : 
    2234              : /* Deconstruct a floating point ABS or NEG operation
    2235              :    with integer registers into integer operations.
                      :    OPERANDS[0] and OPERANDS[1] must match (asserted).  Only
                      :    SFmode, DFmode and XFmode are handled:
                      :    - SFmode: the SImode low part is ANDed with 0x7fffffff for ABS
                      :      or XORed with 0x80000000 for NEG.
                      :    - DFmode with TARGET_64BIT: a one-bit ZERO_EXTRACT at position
                      :      63 of the DImode low part is set to zero for ABS or to its
                      :      own NOT for NEG.
                      :    - DFmode otherwise: the SImode high part is masked exactly as
                      :      in the SFmode case.
                      :    - XFmode: the SImode register REGNO (OPERANDS[0]) + 1 with
                      :      TARGET_64BIT, + 2 otherwise, is ANDed with 0x7fff for ABS or
                      :      XORed with 0x8000 for NEG.
                      :    The resulting SET is emitted in a PARALLEL together with a
                      :    clobber of FLAGS_REG.  */
    2236              : 
    2237              : void
    2238           24 : ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
    2239              :                                rtx operands[])
    2240              : {
    2241           24 :   enum rtx_code absneg_op;
    2242           24 :   rtx dst, set;
    2243              : 
    2244           24 :   gcc_assert (operands_match_p (operands[0], operands[1]));
    2245              : 
    2246           24 :   switch (mode)
    2247              :     {
    2248            0 :     case E_SFmode:
    2249            0 :       dst = gen_lowpart (SImode, operands[0]);
    2250              : 
    2251            0 :       if (code == ABS)
    2252              :         {
    2253            0 :           set = gen_int_mode (0x7fffffff, SImode);
    2254            0 :           absneg_op = AND;
    2255              :         }
    2256              :       else
    2257              :         {
    2258            0 :           set = gen_int_mode (0x80000000, SImode);
    2259            0 :           absneg_op = XOR;
    2260              :         }
    2261            0 :       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
    2262            0 :       break;
    2263              : 
    2264            1 :     case E_DFmode:
    2265            1 :       if (TARGET_64BIT)
    2266              :         {
    2267            1 :           dst = gen_lowpart (DImode, operands[0]);
    2268            1 :           dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
    2269              : 
    2270            1 :           if (code == ABS)
    2271            0 :             set = const0_rtx;
    2272              :           else
    2273            1 :             set = gen_rtx_NOT (DImode, dst);
    2274              :         }
    2275              :       else
    2276              :         {
    2277            0 :           dst = gen_highpart (SImode, operands[0]);
    2278              : 
    2279            0 :           if (code == ABS)
    2280              :             {
    2281            0 :               set = gen_int_mode (0x7fffffff, SImode);
    2282            0 :               absneg_op = AND;
    2283              :             }
    2284              :           else
    2285              :             {
    2286            0 :               set = gen_int_mode (0x80000000, SImode);
    2287            0 :               absneg_op = XOR;
    2288              :             }
    2289            0 :           set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
    2290              :         }
    2291              :       break;
    2292              : 
    2293           23 :     case E_XFmode:
    2294           23 :       dst = gen_rtx_REG (SImode,
    2295           23 :                          REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
    2296           23 :       if (code == ABS)
    2297              :         {
    2298            1 :           set = GEN_INT (0x7fff);
    2299            1 :           absneg_op = AND;
    2300              :         }
    2301              :       else
    2302              :         {
    2303           22 :           set = GEN_INT (0x8000);
    2304           22 :           absneg_op = XOR;
    2305              :         }
    2306           23 :       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
    2307           23 :       break;
    2308              : 
    2309            0 :     default:
    2310            0 :       gcc_unreachable ();
    2311              :     }
    2312              : 
    2313           24 :   set = gen_rtx_SET (dst, set);
    2314              : 
    2315           24 :   rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    2316           24 :   rtvec par = gen_rtvec (2, set, clob);
    2317              : 
    2318           24 :   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
    2319           24 : }
    2320              : 
    2321              : /* Expand a copysign operation.  Special case operand 0 being a constant.  */
    2322              : 
    2323              : void
    2324        23236 : ix86_expand_copysign (rtx operands[])
    2325              : {
    2326        23236 :   machine_mode mode, vmode;
    2327        23236 :   rtx dest, vdest, op0, op1, mask, op2, op3;
    2328              : 
    2329        23236 :   mode = GET_MODE (operands[0]);
    2330              : 
    2331        23236 :   switch (mode)
    2332              :   {
    2333              :   case E_HFmode:
    2334              :     vmode = V8HFmode;
    2335              :     break;
    2336            0 :   case E_BFmode:
    2337            0 :     vmode = V8BFmode;
    2338            0 :     break;
    2339        11562 :   case E_SFmode:
    2340        11562 :     vmode = V4SFmode;
    2341        11562 :     break;
    2342        11535 :   case E_DFmode:
    2343        11535 :     vmode = V2DFmode;
    2344        11535 :     break;
    2345          127 :   case E_TFmode:
    2346          127 :     vmode = mode;
    2347          127 :     break;
    2348            0 :   default:
    2349            0 :     gcc_unreachable();
    2350              :   }
    2351              : 
    2352        23236 :   if (rtx_equal_p (operands[1], operands[2]))
    2353              :     {
    2354            0 :       emit_move_insn (operands[0], operands[1]);
    2355            0 :       return;
    2356              :     }
    2357              : 
    2358        23236 :   dest = operands[0];
    2359        23236 :   vdest = lowpart_subreg (vmode, dest, mode);
    2360        23236 :   if (vdest == NULL_RTX)
    2361            0 :     vdest = gen_reg_rtx (vmode);
    2362              :   else
    2363              :     dest = NULL_RTX;
    2364        23236 :   op1 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
    2365        46458 :   mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
    2366              : 
    2367        23236 :   if (CONST_DOUBLE_P (operands[2]))
    2368              :     {
    2369           79 :       if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
    2370              :         /* Simplify b = copysign (a, negative) to b = mask | a.  */
    2371           76 :         op1 = gen_rtx_IOR (vmode, mask, op1);
    2372              :       else
    2373              :         {
    2374              :           /* Simplify b = copysign (a, positive) to b = invert_mask & a.  */
    2375            3 :           rtx invert_mask
    2376            3 :             = ix86_build_signbit_mask (vmode,
    2377            3 :                                        TARGET_AVX512F && mode != HFmode,
    2378              :                                        true);
    2379            3 :           op1 = gen_rtx_AND (vmode, invert_mask, op1);
    2380              :         }
    2381           79 :       emit_move_insn (vdest, op1);
    2382           79 :       if (dest)
    2383            0 :         emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
    2384           79 :       return;
    2385              :     }
    2386              :   else
    2387        23157 :     op0 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
    2388              : 
    2389        23157 :   op2 = gen_reg_rtx (vmode);
    2390        23157 :   op3 = gen_reg_rtx (vmode);
    2391        23157 :   rtx invert_mask;
    2392              :   /* NB: Generate vmovdqa, vpandn, vpand, vpor for AVX and generate pand,
    2393              :      pand, por for SSE.  */
    2394        23157 :   if (TARGET_AVX)
    2395           31 :     invert_mask = gen_rtx_NOT (vmode, mask);
    2396              :   else
    2397        23126 :     invert_mask = ix86_build_signbit_mask (vmode,
    2398        23126 :                                            TARGET_AVX512F && mode != HFmode,
    2399              :                                            true);
    2400        23157 :   emit_move_insn (op2, gen_rtx_AND (vmode, invert_mask, op1));
    2401        23157 :   emit_move_insn (op3, gen_rtx_AND (vmode, mask, op0));
    2402        23157 :   emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
    2403        23157 :   if (dest)
    2404            0 :     emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
    2405              : }
    2406              : 
    2407              : /* Expand an xorsign operation.  */
    2408              : 
    2409              : void
    2410           20 : ix86_expand_xorsign (rtx operands[])
    2411              : {
    2412           20 :   machine_mode mode, vmode;
    2413           20 :   rtx dest, vdest, op0, op1, mask, x, temp;
    2414              : 
    2415           20 :   dest = operands[0];
    2416           20 :   op0 = operands[1];
    2417           20 :   op1 = operands[2];
    2418              : 
    2419           20 :   mode = GET_MODE (dest);
    2420              : 
    2421           20 :   switch (mode)
    2422              :   {
    2423              :   case E_HFmode:
    2424              :     vmode = V8HFmode;
    2425              :     break;
    2426              :   case E_BFmode:
    2427              :     vmode = V8BFmode;
    2428              :     break;
    2429              :   case E_SFmode:
    2430              :     vmode = V4SFmode;
    2431              :     break;
    2432              :   case E_DFmode:
    2433              :     vmode = V2DFmode;
    2434              :     break;
    2435            0 :   default:
    2436            0 :     gcc_unreachable ();
    2437           20 :     break;
    2438              :   }
    2439              : 
    2440           20 :   temp = gen_reg_rtx (vmode);
    2441           20 :   mask = ix86_build_signbit_mask (vmode, 0, 0);
    2442              : 
    2443           20 :   op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
    2444           20 :   x = gen_rtx_AND (vmode, op1, mask);
    2445           20 :   emit_insn (gen_rtx_SET (temp, x));
    2446              : 
    2447           20 :   op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
    2448           20 :   x = gen_rtx_XOR (vmode, temp, op0);
    2449              : 
    2450           20 :   vdest = lowpart_subreg (vmode, dest, mode);
    2451           20 :   if (vdest == NULL_RTX)
    2452            0 :     vdest = gen_reg_rtx (vmode);
    2453              :   else
    2454              :     dest = NULL_RTX;
    2455           20 :   emit_insn (gen_rtx_SET (vdest, x));
    2456              : 
    2457           20 :   if (dest)
    2458            0 :     emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
    2459           20 : }
    2460              : 
    2461              : static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
    2462              : 
    2463              : void
    2464      6622072 : ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
    2465              : {
    2466      6622072 :   machine_mode mode = GET_MODE (op0);
    2467      6622072 :   rtx tmp;
    2468              : 
    2469              :   /* Handle special case - vector comparison with boolean result, transform
    2470              :      it using ptest instruction or vpcmpeq + kortest.  */
    2471      6622072 :   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
    2472      6602690 :       || (mode == TImode && !TARGET_64BIT)
    2473      6602690 :       || mode == OImode
    2474     13224762 :       || GET_MODE_SIZE (mode) == 64)
    2475              :     {
    2476        19382 :       unsigned msize = GET_MODE_SIZE (mode);
    2477        19382 :       machine_mode p_mode
    2478        19382 :         = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
    2479              :       /* kortest sets CF when result is 0xFFFF (op0 == op1).  */
    2480        19382 :       rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
    2481              : 
    2482        19382 :       gcc_assert (code == EQ || code == NE);
    2483              : 
    2484              :       /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
    2485        19382 :       if (msize == 64)
    2486              :         {
    2487         2435 :           if (mode != V16SImode)
    2488              :             {
    2489         2435 :               op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
    2490         2435 :               op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
    2491              :             }
    2492              : 
    2493         2435 :           tmp = gen_reg_rtx (HImode);
    2494         2435 :           emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
    2495         2435 :           emit_insn (gen_kortesthi_ccc (tmp, tmp));
    2496              :         }
    2497              :       /* Using ptest for 128/256-bit vectors.  */
    2498              :       else
    2499              :         {
    2500        16947 :           if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
    2501              :             {
    2502            0 :               op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
    2503            0 :               op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
    2504            0 :               mode = p_mode;
    2505              :             }
    2506              : 
    2507              :           /* Generate XOR since we can't check that one operand is zero
    2508              :              vector.  */
    2509        16947 :           tmp = gen_reg_rtx (mode);
    2510        16947 :           rtx ops[3] = { tmp, op0, op1 };
    2511        16947 :           ix86_expand_vector_logical_operator (XOR, mode, ops);
    2512        16947 :           tmp = gen_lowpart (p_mode, tmp);
    2513        16947 :           emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
    2514              :                                   gen_rtx_UNSPEC (CCZmode,
    2515              :                                                   gen_rtvec (2, tmp, tmp),
    2516              :                                                   UNSPEC_PTEST)));
    2517              :         }
    2518        19382 :       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
    2519        19382 :       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
    2520              :                                   gen_rtx_LABEL_REF (VOIDmode, label),
    2521              :                                   pc_rtx);
    2522        19382 :       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    2523        19382 :       return;
    2524              :     }
    2525              : 
    2526      6602690 :   switch (mode)
    2527              :     {
    2528      6571988 :     case E_HFmode:
    2529      6571988 :     case E_SFmode:
    2530      6571988 :     case E_DFmode:
    2531      6571988 :     case E_XFmode:
    2532      6571988 :     case E_QImode:
    2533      6571988 :     case E_HImode:
    2534      6571988 :     case E_SImode:
    2535      6571988 :       simple:
    2536      6571988 :       tmp = ix86_expand_compare (code, op0, op1);
    2537      6571988 :       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
    2538              :                                   gen_rtx_LABEL_REF (VOIDmode, label),
    2539              :                                   pc_rtx);
    2540      6571988 :       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    2541      6571988 :       return;
    2542              : 
    2543            7 :     case E_BFmode:
    2544            7 :       gcc_assert (TARGET_AVX10_2 && !flag_trapping_math);
    2545            7 :       goto simple;
    2546              : 
    2547      2661120 :     case E_DImode:
    2548      2661120 :       if (TARGET_64BIT)
    2549      2632427 :         goto simple;
    2550              :       /* FALLTHRU */
    2551        88804 :     case E_TImode:
    2552              :       /* DI and TI mode equality/inequality comparisons may be performed
    2553              :          on SSE registers.  Avoid splitting them, except when optimizing
    2554              :          for size.  */
    2555        88804 :       if ((code == EQ || code == NE)
    2556        88804 :           && !optimize_insn_for_size_p ())
    2557        58102 :         goto simple;
    2558              : 
    2559              :       /* Expand DImode branch into multiple compare+branch.  */
    2560        30702 :       {
    2561        30702 :         rtx lo[2], hi[2];
    2562        30702 :         rtx_code_label *label2;
    2563        30702 :         enum rtx_code code1, code2, code3;
    2564        30702 :         machine_mode submode;
    2565              : 
    2566        30702 :         if (CONSTANT_P (op0) && !CONSTANT_P (op1))
    2567              :           {
    2568            0 :             std::swap (op0, op1);
    2569            0 :             code = swap_condition (code);
    2570              :           }
    2571              : 
    2572        30702 :         split_double_mode (mode, &op0, 1, lo+0, hi+0);
    2573        30702 :         split_double_mode (mode, &op1, 1, lo+1, hi+1);
    2574              : 
    2575        30702 :         submode = mode == DImode ? SImode : DImode;
    2576              : 
    2577              :         /* If we are doing less-than or greater-or-equal-than,
    2578              :            op1 is a constant and the low word is zero, then we can just
    2579              :            examine the high word.  Similarly for low word -1 and
    2580              :            less-or-equal-than or greater-than.  */
    2581              : 
    2582        30702 :         if (CONST_INT_P (hi[1]))
    2583        19772 :           switch (code)
    2584              :             {
    2585        10561 :             case LT: case LTU: case GE: case GEU:
    2586        10561 :               if (lo[1] == const0_rtx)
    2587              :                 {
    2588        10150 :                   ix86_expand_branch (code, hi[0], hi[1], label);
    2589        10150 :                   return;
    2590              :                 }
    2591              :               break;
    2592         7643 :             case LE: case LEU: case GT: case GTU:
    2593         7643 :               if (lo[1] == constm1_rtx)
    2594              :                 {
    2595          524 :                   ix86_expand_branch (code, hi[0], hi[1], label);
    2596          524 :                   return;
    2597              :                 }
    2598              :               break;
    2599              :             default:
    2600              :               break;
    2601              :             }
    2602              : 
    2603              :         /* Emulate comparisons that do not depend on Zero flag with
    2604              :            double-word subtraction.  Note that only Overflow, Sign
    2605              :            and Carry flags are valid, so swap arguments and condition
    2606              :            of comparisons that would otherwise test Zero flag.  */
    2607              : 
    2608        20028 :         switch (code)
    2609              :           {
    2610        12581 :           case LE: case LEU: case GT: case GTU:
    2611        12581 :             std::swap (lo[0], lo[1]);
    2612        12581 :             std::swap (hi[0], hi[1]);
    2613        12581 :             code = swap_condition (code);
    2614              :             /* FALLTHRU */
    2615              : 
    2616        16961 :           case LT: case LTU: case GE: case GEU:
    2617        16961 :             {
    2618        16961 :               bool uns = (code == LTU || code == GEU);
    2619         3981 :               rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
    2620        16961 :                 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
    2621              : 
    2622        16961 :               if (!nonimmediate_operand (lo[0], submode))
    2623         7119 :                 lo[0] = force_reg (submode, lo[0]);
    2624        16961 :               if (!x86_64_general_operand (lo[1], submode))
    2625            0 :                 lo[1] = force_reg (submode, lo[1]);
    2626              : 
    2627        16961 :               if (!register_operand (hi[0], submode))
    2628         7941 :                 hi[0] = force_reg (submode, hi[0]);
    2629        12980 :               if ((uns && !nonimmediate_operand (hi[1], submode))
    2630        16961 :                   || (!uns && !x86_64_general_operand (hi[1], submode)))
    2631          315 :                 hi[1] = force_reg (submode, hi[1]);
    2632              : 
    2633        16961 :               emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
    2634              : 
    2635        16961 :               tmp = gen_rtx_SCRATCH (submode);
    2636        16961 :               emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
    2637              : 
    2638        20942 :               tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
    2639        16961 :               ix86_expand_branch (code, tmp, const0_rtx, label);
    2640        16961 :               return;
    2641              :             }
    2642              : 
    2643         3067 :           default:
    2644         3067 :             break;
    2645              :           }
    2646              : 
    2647              :         /* Otherwise, we need two or three jumps.  */
    2648              : 
    2649         3067 :         label2 = gen_label_rtx ();
    2650              : 
    2651         3067 :         code1 = code;
    2652         3067 :         code2 = swap_condition (code);
    2653         3067 :         code3 = unsigned_condition (code);
    2654              : 
    2655         3067 :         switch (code)
    2656              :           {
    2657              :           case LT: case GT: case LTU: case GTU:
    2658              :             break;
    2659              : 
    2660              :           case LE:   code1 = LT;  code2 = GT;  break;
    2661              :           case GE:   code1 = GT;  code2 = LT;  break;
    2662            0 :           case LEU:  code1 = LTU; code2 = GTU; break;
    2663            0 :           case GEU:  code1 = GTU; code2 = LTU; break;
    2664              : 
    2665              :           case EQ:   code1 = UNKNOWN; code2 = NE;  break;
    2666              :           case NE:   code2 = UNKNOWN; break;
    2667              : 
    2668            0 :           default:
    2669            0 :             gcc_unreachable ();
    2670              :           }
    2671              : 
    2672              :         /*
    2673              :          * a < b =>
    2674              :          *    if (hi(a) < hi(b)) goto true;
    2675              :          *    if (hi(a) > hi(b)) goto false;
    2676              :          *    if (lo(a) < lo(b)) goto true;
    2677              :          *  false:
    2678              :          */
    2679              : 
    2680            0 :         if (code1 != UNKNOWN)
    2681         2399 :           ix86_expand_branch (code1, hi[0], hi[1], label);
    2682         3067 :         if (code2 != UNKNOWN)
    2683          668 :           ix86_expand_branch (code2, hi[0], hi[1], label2);
    2684              : 
    2685         3067 :         ix86_expand_branch (code3, lo[0], lo[1], label);
    2686              : 
    2687         3067 :         if (code2 != UNKNOWN)
    2688          668 :           emit_label (label2);
    2689              :         return;
    2690              :       }
    2691              : 
    2692        17446 :     default:
    2693        17446 :       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
    2694        17446 :       goto simple;
    2695              :     }
    2696              : }
    2697              : 
    2698              : /* Figure out whether to use unordered fp comparisons.  */
    2699              : 
    2700              : static bool
    2701      1149120 : ix86_unordered_fp_compare (enum rtx_code code)
    2702              : {
    2703      1149120 :   if (!TARGET_IEEE_FP)
    2704              :     return false;
    2705              : 
    2706      1144806 :   switch (code)
    2707              :     {
    2708              :     case LT:
    2709              :     case LE:
    2710              :     case GT:
    2711              :     case GE:
    2712              :     case LTGT:
    2713              :       return false;
    2714              : 
    2715              :     case EQ:
    2716              :     case NE:
    2717              : 
    2718              :     case UNORDERED:
    2719              :     case ORDERED:
    2720              :     case UNLT:
    2721              :     case UNLE:
    2722              :     case UNGT:
    2723              :     case UNGE:
    2724              :     case UNEQ:
    2725              :       return true;
    2726              : 
    2727            0 :     default:
    2728            0 :       gcc_unreachable ();
    2729              :     }
    2730              : }
    2731              : 
    2732              : /* Return a comparison we can do and that is equivalent to
    2733              :    swap_condition (code) apart possibly from orderedness.
    2734              :    But, never change orderedness if TARGET_IEEE_FP, returning
    2735              :    UNKNOWN in that case if necessary.  */
    2736              : 
    2737              : static enum rtx_code
    2738        37559 : ix86_fp_swap_condition (enum rtx_code code)
    2739              : {
    2740        37559 :   switch (code)
    2741              :     {
    2742         1859 :     case GT:                   /* GTU - CF=0 & ZF=0 */
    2743         1859 :       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    2744          533 :     case GE:                   /* GEU - CF=0 */
    2745          533 :       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    2746          446 :     case UNLT:                 /* LTU - CF=1 */
    2747          446 :       return TARGET_IEEE_FP ? UNKNOWN : GT;
    2748         6315 :     case UNLE:                 /* LEU - CF=1 | ZF=1 */
    2749         6315 :       return TARGET_IEEE_FP ? UNKNOWN : GE;
    2750        28406 :     default:
    2751        28406 :       return swap_condition (code);
    2752              :     }
    2753              : }
    2754              : 
    2755              : /* Return cost of comparison CODE using the best strategy for performance.
    2756              :    All following functions use the number of instructions as a cost metric.
    2757              :    In future this should be tweaked to compute bytes for optimize_size and
    2758              :    take into account performance of various instructions on various CPUs.  */
    2759              : 
    2760              : static int
    2761      1147986 : ix86_fp_comparison_cost (enum rtx_code code)
    2762              : {
    2763      1147986 :   int arith_cost;
    2764              : 
    2765              :   /* The cost of code using bit-twiddling on %ah.  */
    2766      1147986 :   switch (code)
    2767              :     {
    2768              :     case UNLE:
    2769              :     case UNLT:
    2770              :     case LTGT:
    2771              :     case GT:
    2772              :     case GE:
    2773              :     case UNORDERED:
    2774              :     case ORDERED:
    2775              :     case UNEQ:
    2776              :       arith_cost = 4;
    2777              :       break;
    2778        85018 :     case LT:
    2779        85018 :     case NE:
    2780        85018 :     case EQ:
    2781        85018 :     case UNGE:
    2782        85018 :       arith_cost = TARGET_IEEE_FP ? 5 : 4;
    2783              :       break;
    2784        24753 :     case LE:
    2785        24753 :     case UNGT:
    2786      1063762 :       arith_cost = TARGET_IEEE_FP ? 6 : 4;
    2787              :       break;
    2788            0 :     default:
    2789            0 :       gcc_unreachable ();
    2790              :     }
    2791              : 
    2792      1147986 :   switch (ix86_fp_comparison_strategy (code))
    2793              :     {
    2794      1147986 :     case IX86_FPCMP_COMI:
    2795      1147986 :       return arith_cost > 4 ? 3 : 2;
    2796            0 :     case IX86_FPCMP_SAHF:
    2797            0 :       return arith_cost > 4 ? 4 : 3;
    2798              :     default:
    2799              :       return arith_cost;
    2800              :     }
    2801              : }
    2802              : 
    2803              : /* Swap, force into registers, or otherwise massage the two operands
    2804              :    to a fp comparison.  The operands are updated in place; the new
    2805              :    comparison code is returned.  */
    2806              : 
    2807              : static enum rtx_code
    2808       573993 : ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
    2809              : {
    2810       574064 :   bool unordered_compare = ix86_unordered_fp_compare (code);
    2811       574064 :   rtx op0 = *pop0, op1 = *pop1;
    2812       574064 :   machine_mode op_mode = GET_MODE (op0);
    2813       574064 :   bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (op_mode);
    2814              : 
    2815       571675 :   if (op_mode == BFmode && (!TARGET_AVX10_2 || flag_trapping_math))
    2816              :     {
    2817           71 :       rtx op = gen_lowpart (HImode, op0);
    2818           71 :       if (CONST_INT_P (op))
    2819            0 :         op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
    2820              :                                              op0, BFmode);
    2821              :       else
    2822              :         {
    2823           71 :           rtx t1 = gen_reg_rtx (SImode);
    2824           71 :           emit_insn (gen_zero_extendhisi2 (t1, op));
    2825           71 :           emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
    2826           71 :           op = gen_lowpart (SFmode, t1);
    2827              :         }
    2828           71 :       *pop0 = op;
    2829           71 :       op = gen_lowpart (HImode, op1);
    2830           71 :       if (CONST_INT_P (op))
    2831            6 :         op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
    2832              :                                              op1, BFmode);
    2833              :       else
    2834              :         {
    2835           65 :           rtx t1 = gen_reg_rtx (SImode);
    2836           65 :           emit_insn (gen_zero_extendhisi2 (t1, op));
    2837           65 :           emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
    2838           65 :           op = gen_lowpart (SFmode, t1);
    2839              :         }
    2840           71 :       *pop1 = op;
    2841           71 :       return ix86_prepare_fp_compare_args (code, pop0, pop1);
    2842              :     }
    2843              : 
    2844              :   /* All of the unordered compare instructions only work on registers.
    2845              :      The same is true of the fcomi compare instructions.  The XFmode
    2846              :      compare instructions require registers except when comparing
    2847              :      against zero or when converting operand 1 from fixed point to
    2848              :      floating point.  */
    2849              : 
    2850       573993 :   if (!is_sse
    2851       573993 :       && (unordered_compare
    2852         8247 :           || (op_mode == XFmode
    2853        10627 :               && ! (standard_80387_constant_p (op0) == 1
    2854         5311 :                     || standard_80387_constant_p (op1) == 1)
    2855         4877 :               && GET_CODE (op1) != FLOAT)
    2856         3370 :           || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    2857              :     {
    2858       147804 :       op0 = force_reg (op_mode, op0);
    2859       147804 :       op1 = force_reg (op_mode, op1);
    2860              :     }
    2861              :   else
    2862              :     {
    2863              :       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
    2864              :          things around if that appears profitable, otherwise force op0
    2865              :          into a register.  */
    2866              : 
    2867       426189 :       if (standard_80387_constant_p (op0) == 0
    2868       426189 :           || (MEM_P (op0)
    2869        56466 :               && ! (standard_80387_constant_p (op1) == 0
    2870        41169 :                     || MEM_P (op1))))
    2871              :         {
    2872        37559 :           enum rtx_code new_code = ix86_fp_swap_condition (code);
    2873        37559 :           if (new_code != UNKNOWN)
    2874              :             {
    2875              :               std::swap (op0, op1);
    2876       426189 :               code = new_code;
    2877              :             }
    2878              :         }
    2879              : 
    2880       426189 :       if (!REG_P (op0))
    2881        52773 :         op0 = force_reg (op_mode, op0);
    2882              : 
    2883       426189 :       if (CONSTANT_P (op1))
    2884              :         {
    2885       193768 :           int tmp = standard_80387_constant_p (op1);
    2886       193768 :           if (tmp == 0)
    2887        74267 :             op1 = validize_mem (force_const_mem (op_mode, op1));
    2888       119501 :           else if (tmp == 1)
    2889              :             {
    2890        65397 :               if (TARGET_CMOVE)
    2891        65397 :                 op1 = force_reg (op_mode, op1);
    2892              :             }
    2893              :           else
    2894        54104 :             op1 = force_reg (op_mode, op1);
    2895              :         }
    2896              :     }
    2897              : 
    2898              :   /* Try to rearrange the comparison to make it cheaper.  */
    2899       573993 :   if (ix86_fp_comparison_cost (code)
    2900       573993 :       > ix86_fp_comparison_cost (swap_condition (code))
    2901       573993 :       && (REG_P (op1) || can_create_pseudo_p ()))
    2902              :     {
    2903            0 :       std::swap (op0, op1);
    2904            0 :       code = swap_condition (code);
    2905            0 :       if (!REG_P (op0))
    2906            0 :         op0 = force_reg (op_mode, op0);
    2907              :     }
    2908              : 
    2909       573993 :   *pop0 = op0;
    2910       573993 :   *pop1 = op1;
    2911       573993 :   return code;
    2912              : }
    2913              : 
    2914              : /* Generate insn patterns to do a floating point compare of OPERANDS.  */
    2915              : 
    2916              : static rtx
    2917       573993 : ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
    2918              : {
    2919       573993 :   bool unordered_compare = ix86_unordered_fp_compare (code);
    2920       573993 :   machine_mode cmp_mode;
    2921       573993 :   rtx tmp, scratch;
    2922              : 
    2923       573993 :   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
    2924              : 
    2925       573993 :   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
    2926       573993 :   if (unordered_compare)
    2927       498425 :     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
    2928              : 
    2929              :   /* Do fcomi/sahf based test when profitable.  */
    2930       573993 :   switch (ix86_fp_comparison_strategy (code))
    2931              :     {
    2932       573993 :     case IX86_FPCMP_COMI:
    2933       573993 :       tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
    2934              :       /* We only have vcomisbf16; there is no vcomubf16 or vcomxbf16.  */
    2935       573993 :       if (GET_MODE (op0) != E_BFmode)
    2936              :         {
    2937       573965 :           if (TARGET_AVX10_2 && (code == EQ || code == NE))
    2938          972 :             tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
    2939       573965 :           if (unordered_compare)
    2940       498417 :             tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
    2941              :         }
    2942       573993 :       cmp_mode = CCFPmode;
    2943       573993 :       emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
    2944       573993 :       break;
    2945              : 
    2946            0 :     case IX86_FPCMP_SAHF:
    2947            0 :       cmp_mode = CCFPmode;
    2948            0 :       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
    2949            0 :       scratch = gen_reg_rtx (HImode);
    2950            0 :       emit_insn (gen_rtx_SET (scratch, tmp));
    2951            0 :       emit_insn (gen_x86_sahf_1 (scratch));
    2952            0 :       break;
    2953              : 
    2954            0 :     case IX86_FPCMP_ARITH:
    2955            0 :       cmp_mode = CCNOmode;
    2956            0 :       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
    2957            0 :       scratch = gen_reg_rtx (HImode);
    2958            0 :       emit_insn (gen_rtx_SET (scratch, tmp));
    2959              : 
    2960              :       /* In the unordered case, we have to check C2 for NaN's, which
    2961              :          doesn't happen to work out to anything nice combination-wise.
    2962              :          So do some bit twiddling on the value we've got in AH to come
    2963              :          up with an appropriate set of condition codes.  */
    2964              : 
    2965            0 :       switch (code)
    2966              :         {
    2967            0 :         case GT:
    2968            0 :         case UNGT:
    2969            0 :           if (code == GT || !TARGET_IEEE_FP)
    2970              :             {
    2971            0 :               emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
    2972            0 :               code = EQ;
    2973              :             }
    2974              :           else
    2975              :             {
    2976            0 :               emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
    2977            0 :               emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
    2978            0 :               emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
    2979            0 :               cmp_mode = CCmode;
    2980            0 :               code = GEU;
    2981              :             }
    2982              :           break;
    2983            0 :         case LT:
    2984            0 :         case UNLT:
    2985            0 :           if (code == LT && TARGET_IEEE_FP)
    2986              :             {
    2987            0 :               emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
    2988            0 :               emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
    2989            0 :               cmp_mode = CCmode;
    2990            0 :               code = EQ;
    2991              :             }
    2992              :           else
    2993              :             {
    2994            0 :               emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
    2995            0 :               code = NE;
    2996              :             }
    2997              :           break;
    2998            0 :         case GE:
    2999            0 :         case UNGE:
    3000            0 :           if (code == GE || !TARGET_IEEE_FP)
    3001              :             {
    3002            0 :               emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
    3003            0 :               code = EQ;
    3004              :             }
    3005              :           else
    3006              :             {
    3007            0 :               emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
    3008            0 :               emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
    3009            0 :               code = NE;
    3010              :             }
    3011              :           break;
    3012            0 :         case LE:
    3013            0 :         case UNLE:
    3014            0 :           if (code == LE && TARGET_IEEE_FP)
    3015              :             {
    3016            0 :               emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
    3017            0 :               emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
    3018            0 :               emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
    3019            0 :               cmp_mode = CCmode;
    3020            0 :               code = LTU;
    3021              :             }
    3022              :           else
    3023              :             {
    3024            0 :               emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
    3025            0 :               code = NE;
    3026              :             }
    3027              :           break;
    3028            0 :         case EQ:
    3029            0 :         case UNEQ:
    3030            0 :           if (code == EQ && TARGET_IEEE_FP)
    3031              :             {
    3032            0 :               emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
    3033            0 :               emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
    3034            0 :               cmp_mode = CCmode;
    3035            0 :               code = EQ;
    3036              :             }
    3037              :           else
    3038              :             {
    3039            0 :               emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
    3040            0 :               code = NE;
    3041              :             }
    3042              :           break;
    3043            0 :         case NE:
    3044            0 :         case LTGT:
    3045            0 :           if (code == NE && TARGET_IEEE_FP)
    3046              :             {
    3047            0 :               emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
    3048            0 :               emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
    3049              :                                              GEN_INT (0x40)));
    3050            0 :               code = NE;
    3051              :             }
    3052              :           else
    3053              :             {
    3054            0 :               emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
    3055            0 :               code = EQ;
    3056              :             }
    3057              :           break;
    3058              : 
    3059            0 :         case UNORDERED:
    3060            0 :           emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
    3061            0 :           code = NE;
    3062            0 :           break;
    3063            0 :         case ORDERED:
    3064            0 :           emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
    3065            0 :           code = EQ;
    3066            0 :           break;
    3067              : 
    3068            0 :         default:
    3069            0 :           gcc_unreachable ();
    3070              :         }
    3071              :         break;
    3072              : 
    3073            0 :     default:
    3074            0 :       gcc_unreachable();
    3075              :     }
    3076              : 
    3077              :   /* Return the test that should be put into the flags user, i.e.
    3078              :      the bcc, scc, or cmov instruction.  */
    3079       573993 :   return gen_rtx_fmt_ee (code, VOIDmode,
    3080              :                          gen_rtx_REG (cmp_mode, FLAGS_REG),
    3081              :                          const0_rtx);
    3082              : }
    3083              : 
    3084              : /* Generate insn patterns to do an integer compare of OP0 and OP1 using
                           comparison CODE.  Emit the insn that sets the flags register and
                           return the test against that register that the flags user should
                           use.  The operands may be swapped (and CODE adjusted to match)
                           before the compare is emitted.  */
    3085              : 
    3086              : static rtx
    3087      6950446 : ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
    3088              : {
    3089      6950446 :   machine_mode cmpmode;
    3090      6950446 :   rtx tmp, flags;
    3091              : 
    3092              :   /* Swap operands to emit carry flag comparison.  */
    3093      6950446 :   if ((code == GTU || code == LEU)
    3094      6950446 :       && nonimmediate_operand (op1, VOIDmode))
    3095              :     {
    3096       142790 :       std::swap (op0, op1);
    3097       142790 :       code = swap_condition (code);
    3098              :     }
    3099              : 
                          /* The CC mode is chosen after the possible swap, so it matches
                             the code and operands actually emitted.  */
    3100      6950446 :   cmpmode = SELECT_CC_MODE (code, op0, op1);
    3101      6950446 :   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
    3102              : 
    3103              :   /* Attempt to use PTEST, if available, when testing vector modes for
    3104              :      equality/inequality against zero.  This applies only to a TImode
                             subreg at byte 0 of a 16-byte vector register, compared
                             against zero in CCZmode, with TARGET_SSE4_1.  */
    3105      6950446 :   if (op1 == const0_rtx
    3106      2914495 :       && SUBREG_P (op0)
    3107        22876 :       && cmpmode == CCZmode
    3108        10381 :       && SUBREG_BYTE (op0) == 0
    3109         8697 :       && REG_P (SUBREG_REG (op0))
    3110         8697 :       && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
    3111            8 :       && TARGET_SSE4_1
    3112            2 :       && GET_MODE (op0) == TImode
    3113      6950450 :       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
    3114              :     {
    3115            2 :       tmp = SUBREG_REG (op0);
                            /* NOTE(review): V8HF/V8BF vectors are viewed as V8HImode
                               first, presumably because the PTEST pattern does not accept
                               those modes -- confirm against the machine description.  */
    3116            2 :       if (GET_MODE (tmp) == V8HFmode || GET_MODE (tmp) == V8BFmode)
    3117            1 :         tmp = gen_lowpart (V8HImode, tmp);
                            /* The vector is tested against itself.  */
    3118            2 :       tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
    3119              :     }
    3120              :   else
    3121      6950444 :     tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
    3122              : 
    3123              :   /* This is very simple, but making the interface the same as in the
    3124              :      FP case makes the rest of the code easier.  */
    3125      6950446 :   emit_insn (gen_rtx_SET (flags, tmp));
    3126              : 
    3127              :   /* Return the test that should be put into the flags user, i.e.
    3128              :      the bcc, scc, or cmov instruction.  */
    3129      6950446 :   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
    3130              : }
    3131              : 
                        /* Expand a comparison of OP0 and OP1 with comparison CODE and return
                           the test rtx for the flags user.  Dispatch on the mode of OP0:
                           a MODE_CC operand is wrapped as is, a scalar floating point
                           operand goes to ix86_expand_fp_compare, and everything else goes
                           to ix86_expand_int_compare.  */
    3132              : static rtx
    3133      7654540 : ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
    3134              : {
    3135      7654540 :   rtx ret;
    3136              : 
                          /* OP0 already has a condition-code mode: emit nothing and just
                             build the test on OP0 and OP1.  */
    3137      7654540 :   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    3138       132221 :     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
    3139              : 
    3140      7522319 :   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    3141              :     {
                            /* Decimal floating point modes are not expected here.  */
    3142       571873 :       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
    3143       571873 :       ret = ix86_expand_fp_compare (code, op0, op1);
    3144              :     }
    3145              :   else
    3146      6950446 :     ret = ix86_expand_int_compare (code, op0, op1);
    3147              : 
    3148      7654540 :   return ret;
    3149              : }
    3150              : 
                        /* Expand a setcc: emit the comparison of OP0 and OP1 with comparison
                           CODE and set DEST to the resulting test.  DEST must have QImode;
                           the test returned by ix86_expand_compare is given QImode before
                           it is used as the source of the set.  */
    3151              : void
    3152       587993 : ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
    3153              : {
    3154       587993 :   rtx ret;
    3155              : 
    3156       587993 :   gcc_assert (GET_MODE (dest) == QImode);
    3157              : 
    3158       587993 :   ret = ix86_expand_compare (code, op0, op1);
    3159       587993 :   PUT_MODE (ret, QImode);
    3160       587993 :   emit_insn (gen_rtx_SET (dest, ret));
    3161       587993 : }
    3162              : 
    3163              : /* Expand floating point op0 <=> op1, i.e.
    3164              :    dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128.
                           OP2 selects the expansion.  When it is const0_rtx the result is
                           built with conditional jumps and -128 is stored for unordered
                           operands.  Otherwise the result is computed as the difference of
                           two setcc values (GT minus UNLT on the flags) and OP2 itself is
                           stored for unordered operands.  The unordered path is only
                           emitted when TARGET_IEEE_FP.  */
    3165              : 
    3166              : void
    3167          244 : ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
    3168              : {
                          /* All tests below read the flags in CCFPmode, which the ARITH
                             strategy does not produce.  */
    3169          244 :   gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
    3170          244 :   rtx zero = NULL_RTX;
                          /* Cleared SImode register created ahead of the compare; see the
                             comment at its use below.  */
    3171          244 :   if (op2 != const0_rtx
    3172           52 :       && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
    3173           34 :       && GET_MODE (dest) == SImode)
    3174           34 :     zero = force_reg (SImode, const0_rtx);
                          /* Emit the compare once; GT is the returned test and XEXP (gt, 0)
                             is the flags register it refers to.  */
    3175          244 :   rtx gt = ix86_expand_fp_compare (GT, op0, op1);
                          /* L0: equal, L1: greater (both only for the branchy expansion),
                             L2: unordered (only with TARGET_IEEE_FP), LEND: join point.  */
    3176          244 :   rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
    3177          244 :   rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
    3178          244 :   rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
    3179          244 :   rtx lend = gen_label_rtx ();
    3180          244 :   rtx tmp;
    3181          244 :   rtx_insn *jmp;
    3182          244 :   if (l2)
    3183              :     {
                            /* Jump to L2 when the operands are unordered; predicted very
                               unlikely.  */
    3184          207 :       rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
    3185              :                                gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
    3186          207 :       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
    3187              :                                   gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
    3188          207 :       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    3189          207 :       add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
    3190              :     }
    3191          244 :   if (op2 == const0_rtx)
    3192              :     {
                            /* Branchy expansion: jump to L0 on UNEQ, to L1 on GT, and
                               fall through to store -1.  */
    3193          192 :       rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
    3194              :                                gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
    3195          192 :       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
    3196              :                                   gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
    3197          192 :       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    3198          192 :       add_reg_br_prob_note (jmp, profile_probability::unlikely ());
    3199          192 :       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
    3200              :                                   gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
    3201          192 :       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    3202          192 :       add_reg_br_prob_note (jmp, profile_probability::even ());
    3203          192 :       emit_move_insn (dest, constm1_rtx);
    3204          192 :       emit_jump (lend);
    3205          192 :       emit_label (l0);
    3206          192 :       emit_move_insn (dest, const0_rtx);
    3207          192 :       emit_jump (lend);
    3208          192 :       emit_label (l1);
    3209          192 :       emit_move_insn (dest, const1_rtx);
    3210              :     }
    3211              :   else
    3212              :     {
                            /* Branchless expansion: DEST = GT_TMP - LT_TMP.  LT_TMP stays
                               NULL_RTX only for SImode DEST with
                               TARGET_ZERO_EXTEND_WITH_AND, which is handled by the sbb
                               path at the end.  */
    3213           52 :       rtx lt_tmp = NULL_RTX;
    3214           52 :       if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
    3215              :         {
    3216           52 :           lt_tmp = gen_reg_rtx (QImode);
    3217           52 :           ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
    3218              :                              const0_rtx);
    3219           52 :           if (GET_MODE (dest) != QImode)
    3220              :             {
    3221           52 :               tmp = gen_reg_rtx (GET_MODE (dest));
    3222           52 :               emit_insn (gen_rtx_SET (tmp,
    3223              :                                       gen_rtx_ZERO_EXTEND (GET_MODE (dest),
    3224              :                                                            lt_tmp)));
    3225           52 :               lt_tmp = tmp;
    3226              :             }
    3227              :         }
    3228           52 :       rtx gt_tmp;
    3229           52 :       if (zero)
    3230              :         {
    3231              :           /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
    3232              :              before the floating point comparison and use setcc_si_slp
    3233              :              pattern to hide it from the combiner, so that it doesn't
    3234              :              undo it.  Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
    3235              :              the ZERO_EXTEND normally emitted would need to be AND
    3236              :              with flags clobber.  */
    3237           34 :           tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
    3238           34 :           PUT_MODE (tmp, QImode);
    3239           34 :           emit_insn (gen_setcc_si_slp (zero, tmp, zero));
    3240           34 :           gt_tmp = zero;
    3241              :         }
    3242              :       else
    3243              :         {
                                /* Plain setcc of GT, zero-extended to the mode of DEST
                                   when DEST is wider than QImode.  */
    3244           18 :           gt_tmp = gen_reg_rtx (QImode);
    3245           18 :           ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx);
    3246           18 :           if (GET_MODE (dest) != QImode)
    3247              :             {
    3248           18 :               tmp = gen_reg_rtx (GET_MODE (dest));
    3249           18 :               emit_insn (gen_rtx_SET (tmp,
    3250              :                                       gen_rtx_ZERO_EXTEND (GET_MODE (dest),
    3251              :                                                            gt_tmp)));
    3252           18 :               gt_tmp = tmp;
    3253              :             }
    3254              :         }
    3255           52 :       if (lt_tmp)
    3256              :         {
                                /* expand_simple_binop may return a register other than
                                   DEST; copy the result over in that case.  */
    3257           52 :           tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
    3258              :                                      dest, 0, OPTAB_DIRECT);
    3259           52 :           if (!rtx_equal_p (tmp, dest))
    3260            0 :             emit_move_insn (dest, tmp);
    3261              :         }
    3262              :       else
    3263              :         {
    3264              :           /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
    3265              :              do ZERO_EXTEND without clobbering flags.  */
    3266            0 :           tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
    3267            0 :           PUT_MODE (tmp, SImode);
    3268            0 :           emit_insn (gen_subsi3_carry (dest, gt_tmp,
    3269            0 :                                        force_reg (GET_MODE (dest), const0_rtx),
    3270              :                                        XEXP (gt, 0), tmp));
    3271              :         }
    3272              :     }
    3273          244 :   emit_jump (lend);
    3274          244 :   if (l2)
    3275              :     {
                            /* Unordered result: -128 for the branchy expansion, OP2
                               otherwise.  */
    3276          207 :       emit_label (l2);
    3277          207 :       emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
    3278              :     }
    3279          244 :   emit_label (lend);
    3280          244 : }
    3281              : 
    3282              : /* Expand integral op0 <=> op1, i.e.
    3283              :    dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1.
                           OP2 must be a nonzero CONST_INT.  The value 1 selects the
                           unsigned codes (LTU/GTU on a CCmode compare); any other value
                           selects the signed codes (LT/GT on a CCGCmode compare).  */
    3284              : 
    3285              : void
    3286           35 : ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
    3287              : {
    3288           35 :   gcc_assert (INTVAL (op2));
                          /* With TARGET_ZERO_EXTEND_WITH_AND and SImode DEST, cleared
                             registers are created before the compare is emitted: ZERO1
                             receives the "greater" setcc, and ZERO2 (signed case only)
                             the "less" setcc.  */
    3289           35 :   rtx zero1 = NULL_RTX, zero2 = NULL_RTX;
    3290           35 :   if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode)
    3291              :     {
    3292            0 :       zero1 = force_reg (SImode, const0_rtx);
    3293            0 :       if (INTVAL (op2) != 1)
    3294            0 :         zero2 = force_reg (SImode, const0_rtx);
    3295              :     }
    3296              : 
    3297              :   /* Not using ix86_expand_int_compare here, so that it doesn't swap
    3298              :      operands nor optimize CC mode - we need a mode usable for both
    3299              :      LT and GT resp. LTU and GTU comparisons with the same unswapped
    3300              :      operands.  */
    3301           51 :   rtx flags = gen_rtx_REG (INTVAL (op2) != 1 ? CCGCmode : CCmode, FLAGS_REG);
    3302           35 :   rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1);
    3303           35 :   emit_insn (gen_rtx_SET (flags, tmp));
                          /* LT_TMP stays NULL_RTX only when ZERO1 is set and ZERO2 is not
                             (unsigned case with TARGET_ZERO_EXTEND_WITH_AND); that case is
                             handled by the sbb path at the end.  */
    3304           35 :   rtx lt_tmp = NULL_RTX;
    3305           35 :   if (zero2)
    3306              :     {
    3307              :       /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
    3308              :          ZERO_EXTEND.  */
    3309            0 :       tmp = ix86_expand_compare (LT, flags, const0_rtx);
    3310            0 :       PUT_MODE (tmp, QImode);
    3311            0 :       emit_insn (gen_setcc_si_slp (zero2, tmp, zero2));
    3312            0 :       lt_tmp = zero2;
    3313              :     }
    3314           35 :   else if (!zero1)
    3315              :     {
                            /* Plain setcc of the "less" condition, zero-extended to the
                               mode of DEST when DEST is wider than QImode.  */
    3316           35 :       lt_tmp = gen_reg_rtx (QImode);
    3317           51 :       ix86_expand_setcc (lt_tmp, INTVAL (op2) != 1 ? LT : LTU, flags,
    3318              :                          const0_rtx);
    3319           35 :       if (GET_MODE (dest) != QImode)
    3320              :         {
    3321           35 :           tmp = gen_reg_rtx (GET_MODE (dest));
    3322           35 :           emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
    3323              :                                                             lt_tmp)));
    3324           35 :           lt_tmp = tmp;
    3325              :         }
    3326              :     }
    3327           35 :   rtx gt_tmp;
    3328           35 :   if (zero1)
    3329              :     {
    3330              :       /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
    3331              :          ZERO_EXTEND.  */
    3332            0 :       tmp = ix86_expand_compare (INTVAL (op2) != 1 ? GT : GTU, flags,
    3333              :                                  const0_rtx);
    3334            0 :       PUT_MODE (tmp, QImode);
    3335            0 :       emit_insn (gen_setcc_si_slp (zero1, tmp, zero1));
    3336            0 :       gt_tmp = zero1;
    3337              :     }
    3338              :   else
    3339              :     {
                            /* Plain setcc of the "greater" condition, zero-extended to the
                               mode of DEST when DEST is wider than QImode.  */
    3340           35 :       gt_tmp = gen_reg_rtx (QImode);
    3341           51 :       ix86_expand_setcc (gt_tmp, INTVAL (op2) != 1 ? GT : GTU, flags,
    3342              :                          const0_rtx);
    3343           35 :       if (GET_MODE (dest) != QImode)
    3344              :         {
    3345           35 :           tmp = gen_reg_rtx (GET_MODE (dest));
    3346           35 :           emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
    3347              :                                                             gt_tmp)));
    3348           35 :           gt_tmp = tmp;
    3349              :         }
    3350              :     }
    3351           35 :   if (lt_tmp)
    3352              :     {
                            /* DEST = GT_TMP - LT_TMP.  expand_simple_binop may return a
                               register other than DEST; copy the result over in that
                               case.  */
    3353           35 :       tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
    3354              :                                  0, OPTAB_DIRECT);
    3355           35 :       if (!rtx_equal_p (tmp, dest))
    3356            0 :         emit_move_insn (dest, tmp);
    3357              :     }
    3358              :   else
    3359              :     {
    3360              :       /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
    3361              :          do ZERO_EXTEND without clobbering flags.  */
    3362            0 :       tmp = ix86_expand_compare (LTU, flags, const0_rtx);
    3363            0 :       PUT_MODE (tmp, SImode);
    3364            0 :       emit_insn (gen_subsi3_carry (dest, gt_tmp,
    3365            0 :                                    force_reg (GET_MODE (dest), const0_rtx),
    3366              :                                    flags, tmp));
    3367              :     }
    3368           35 : }
    3369              : 
    3370              : /* Expand a comparison of OP0 and OP1 with comparison CODE so that its
    3371              :    result is an LTU or GEU test, i.e. a test of the carry flag.  Return
                           true when successful and store the resulting test in *POP; return
                           false when the comparison cannot be expressed that way.  */
    3372              : static bool
    3373        29144 : ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
    3374              : {
                          /* Take the mode from OP1 when OP0 has VOIDmode.  */
    3375        58288 :   machine_mode mode
    3376        29144 :     = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
    3377              : 
    3378              :   /* Do not handle double-mode compares that go through special path.  */
    3379        31469 :   if (mode == (TARGET_64BIT ? TImode : DImode))
    3380              :     return false;
    3381              : 
    3382        29134 :   if (SCALAR_FLOAT_MODE_P (mode))
    3383              :     {
    3384         1878 :       rtx compare_op;
    3385         1878 :       rtx_insn *compare_seq;
    3386              : 
    3387         1878 :       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
    3388              : 
    3389              :       /* Shortcut:  following common codes never translate
    3390              :          into carry flag compares.  */
    3391         1878 :       if (code == EQ || code == NE || code == UNEQ || code == LTGT
    3392              :           || code == ORDERED || code == UNORDERED)
    3393              :         return false;
    3394              : 
    3395              :       /* These comparisons require zero flag; swap operands so they won't.  */
    3396              :       if ((code == GT || code == UNLE || code == LE || code == UNGT)
    3397         1813 :           && !TARGET_IEEE_FP)
    3398              :         {
    3399            2 :           std::swap (op0, op1);
    3400            2 :           code = swap_condition (code);
    3401              :         }
    3402              : 
    3403              :       /* Try to expand the comparison and verify that we end up with
    3404              :          a carry flag based comparison.  This fails only when we decide
    3405              :          to expand the comparison using arithmetic, which is not a very
    3406              :          common scenario.  The insns are collected in a sequence so that
                                 nothing is emitted when the attempt is rejected.  */
    3407         1876 :       start_sequence ();
    3408         1876 :       compare_op = ix86_expand_fp_compare (code, op0, op1);
    3409         1876 :       compare_seq = end_sequence ();
    3410              : 
                            /* For a CCFPmode test, judge the equivalent integer code.  */
    3411         1876 :       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
    3412         1876 :         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
    3413              :       else
    3414            0 :         code = GET_CODE (compare_op);
    3415              : 
    3416         1876 :       if (code != LTU && code != GEU)
    3417              :         return false;
    3418              : 
    3419           63 :       emit_insn (compare_seq);
    3420           63 :       *pop = compare_op;
    3421           63 :       return true;
    3422              :     }
    3423              : 
    3424        27256 :   if (!INTEGRAL_MODE_P (mode))
    3425              :     return false;
    3426              : 
                          /* Rewrite the integer comparison as LTU or GEU, adjusting the
                             operands as needed, or give up.  */
    3427        27180 :   switch (code)
    3428              :     {
    3429              :     case LTU:
    3430              :     case GEU:
    3431              :       break;
    3432              : 
    3433              :     /* Convert a==0 into (unsigned)a<1.  */
    3434        23652 :     case EQ:
    3435        23652 :     case NE:
    3436        23652 :       if (op1 != const0_rtx)
    3437              :         return false;
    3438        10154 :       op1 = const1_rtx;
    3439        10154 :       code = (code == EQ ? LTU : GEU);
    3440              :       break;
    3441              : 
    3442              :     /* Convert a>b into b<a or a>=b+1.  */
    3443          699 :     case GTU:
    3444          699 :     case LEU:
    3445          699 :       if (CONST_INT_P (op1))
    3446              :         {
    3447          657 :           op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
    3448              :           /* Bail out on overflow.  We still can swap operands but that
    3449              :              would force loading of the constant into register.  */
    3450          657 :           if (op1 == const0_rtx
    3451          657 :               || !x86_64_immediate_operand (op1, GET_MODE (op1)))
    3452            0 :             return false;
    3453          657 :           code = (code == GTU ? GEU : LTU);
    3454              :         }
    3455              :       else
    3456              :         {
    3457           42 :           std::swap (op0, op1);
    3458           42 :           code = (code == GTU ? LTU : GEU);
    3459              :         }
    3460              :       break;
    3461              : 
    3462              :     /* Convert a>=0 into (unsigned)a<0x80000000.  */
    3463         1294 :     case LT:
    3464         1294 :     case GE:
    3465         1294 :       if (mode == DImode || op1 != const0_rtx)
    3466              :         return false;
                            /* NOTE(review): the sign-bit constant is computed with a shift
                               of the int literal 1; confirm this is well defined for
                               every MODE that can reach this point.  */
    3467          204 :       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
    3468          102 :       code = (code == LT ? GEU : LTU);
    3469              :       break;
                        /* Likewise for comparisons against -1: a>-1 is handled as a>=0
                           and a<=-1 as a<0.  */
    3470          842 :     case LE:
    3471          842 :     case GT:
    3472          842 :       if (mode == DImode || op1 != constm1_rtx)
    3473              :         return false;
    3474            0 :       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
    3475            0 :       code = (code == LE ? GEU : LTU);
    3476              :       break;
    3477              : 
    3478              :     default:
    3479              :       return false;
    3480              :     }
    3481              :   /* Swapping operands may cause constant to appear as first operand.  */
    3482        11648 :   if (!nonimmediate_operand (op0, VOIDmode))
    3483              :     {
    3484            0 :       if (!can_create_pseudo_p ())
    3485              :         return false;
    3486            0 :       op0 = force_reg (mode, op0);
    3487              :     }
    3488        11648 :   *pop = ix86_expand_compare (code, op0, op1);
                          /* The expander must not have rewritten the code into something
                             other than a carry flag test.  */
    3489        11648 :   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
    3490              :   return true;
    3491              : }
    3492              : 
    3493              : /* Expand conditional increment or decrement using adc/sbb instructions.
    3494              :    The default case using setcc followed by the conditional move can be
    3495              :    done by generic code.  OPERANDS[0] is the destination, OPERANDS[1] the
                           comparison, OPERANDS[2] the value being adjusted and OPERANDS[3]
                           the adjustment.  Return false, leaving the expansion to the
                           caller, when OPERANDS[3] is neither const1_rtx nor constm1_rtx or
                           when the comparison cannot be done through the carry flag.  */
    3496              : bool
    3497         6806 : ix86_expand_int_addcc (rtx operands[])
    3498              : {
    3499         6806 :   enum rtx_code code = GET_CODE (operands[1]);
    3500         6806 :   rtx flags;
    3501         6806 :   rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
    3502         6806 :   rtx compare_op;
    3503         6806 :   rtx val = const0_rtx;
    3504         6806 :   bool fpcmp = false;
    3505         6806 :   machine_mode mode;
    3506         6806 :   rtx op0 = XEXP (operands[1], 0);
    3507         6806 :   rtx op1 = XEXP (operands[1], 1);
    3508              : 
    3509         6806 :   if (operands[3] != const1_rtx
    3510         2835 :       && operands[3] != constm1_rtx)
    3511              :     return false;
    3512         4695 :   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    3513              :      return false;
    3514         1270 :   code = GET_CODE (compare_op);
    3515              : 
    3516         1270 :   flags = XEXP (compare_op, 0);
    3517              : 
                          /* A CCFPmode test is classified by its equivalent integer
                             code.  */
    3518         1270 :   if (GET_MODE (flags) == CCFPmode)
    3519              :     {
    3520            4 :       fpcmp = true;
    3521            4 :       code = ix86_fp_compare_code_to_integer (code);
    3522              :     }
    3523              : 
                          /* When the test is not LTU, reverse the condition stored in
                             COMPARE_OP and use -1 instead of 0 as the constant operand of
                             the adc/sbb.  */
    3524         1270 :   if (code != LTU)
    3525              :     {
    3526          735 :       val = constm1_rtx;
    3527          735 :       if (fpcmp)
    3528            4 :         PUT_CODE (compare_op,
    3529              :                   reverse_condition_maybe_unordered
    3530              :                     (GET_CODE (compare_op)));
    3531              :       else
    3532          731 :         PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    3533              :     }
    3534              : 
    3535         1270 :   mode = GET_MODE (operands[0]);
    3536              : 
    3537              :   /* Construct either adc or sbb insn.  */
    3538         1270 :   if ((code == LTU) == (operands[3] == constm1_rtx))
    3539              :     insn = gen_sub3_carry;
    3540              :   else
    3541          515 :     insn = gen_add3_carry;
    3542              : 
    3543         1270 :   emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
    3544              : 
    3545         1270 :   return true;
    3546              : }
    3547              : 
    3548              : bool
    3549       436019 : ix86_expand_int_movcc (rtx operands[])
    3550              : {
    3551       436019 :   enum rtx_code code = GET_CODE (operands[1]), compare_code;
    3552       436019 :   rtx_insn *compare_seq;
    3553       436019 :   rtx compare_op;
    3554       436019 :   machine_mode mode = GET_MODE (operands[0]);
    3555       436019 :   bool sign_bit_compare_p = false;
    3556       436019 :   bool negate_cc_compare_p = false;
    3557       436019 :   rtx op0 = XEXP (operands[1], 0);
    3558       436019 :   rtx op1 = XEXP (operands[1], 1);
    3559       436019 :   rtx op2 = operands[2];
    3560       436019 :   rtx op3 = operands[3];
    3561              : 
    3562       436019 :   if (GET_MODE (op0) == TImode
    3563       420585 :       || (GET_MODE (op0) == DImode
    3564       105600 :           && !TARGET_64BIT))
    3565              :     return false;
    3566              : 
    3567       419489 :   if (GET_MODE (op0) == BFmode
    3568       419489 :       && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    3569              :     return false;
    3570              : 
    3571       419489 :   start_sequence ();
    3572       419489 :   compare_op = ix86_expand_compare (code, op0, op1);
    3573       419489 :   compare_seq = end_sequence ();
    3574              : 
    3575       419489 :   compare_code = GET_CODE (compare_op);
    3576              : 
    3577       419489 :   if ((op1 == const0_rtx && (code == GE || code == LT))
    3578       377561 :       || (op1 == constm1_rtx && (code == GT || code == LE)))
    3579              :     sign_bit_compare_p = true;
    3580              : 
    3581              :   /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
    3582              :      but if op1 is a constant, the latter form allows more optimizations,
    3583              :      either through the last 2 ops being constant handling, or the one
    3584              :      constant and one variable cases.  On the other side, for cmov the
    3585              :      former might be better as we don't need to load the constant into
    3586              :      another register.  */
    3587       377561 :   if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
    3588              :     op2 = op1;
    3589              :   /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
    3590       418978 :   else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
    3591              :     op3 = op1;
    3592              : 
    3593              :   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
    3594              :      HImode insns, we'd be swallowed in word prefix ops.  */
    3595              : 
    3596         4849 :   if ((mode != HImode || TARGET_FAST_PREFIX)
    3597       449949 :       && (mode != (TARGET_64BIT ? TImode : DImode))
    3598       419489 :       && CONST_INT_P (op2)
    3599       451736 :       && CONST_INT_P (op3))
    3600              :     {
    3601        25253 :       rtx out = operands[0];
    3602        25253 :       HOST_WIDE_INT ct = INTVAL (op2);
    3603        25253 :       HOST_WIDE_INT cf = INTVAL (op3);
    3604        25253 :       HOST_WIDE_INT diff;
    3605              : 
    3606        25253 :       if ((mode == SImode
    3607        11729 :            || (TARGET_64BIT && mode == DImode))
    3608        18297 :           && (GET_MODE (op0) == SImode
    3609        14299 :               || (TARGET_64BIT && GET_MODE (op0) == DImode)))
    3610              :         {
    3611              :           /* Special case x != 0 ? -1 : y.  */
    3612        13136 :           if (code == NE && op1 == const0_rtx && ct == -1)
    3613              :             {
    3614              :               negate_cc_compare_p = true;
    3615              :               std::swap (ct, cf);
    3616              :               code = EQ;
    3617              :             }
    3618        13035 :           else if (code == EQ && op1 == const0_rtx && cf == -1)
    3619        25253 :             negate_cc_compare_p = true;
    3620              :         }
    3621              : 
    3622        25253 :       diff = (unsigned HOST_WIDE_INT) ct - cf;
    3623              :       /* Make sure we can represent the difference between the two values.  */
    3624        25253 :       if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3625       436019 :         return false;
    3626              : 
    3627              :       /*  Sign bit compares are better done using shifts than we do by using
    3628              :           sbb.  */
    3629        25105 :       if (sign_bit_compare_p
    3630        25105 :           || negate_cc_compare_p
    3631        25105 :           || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    3632              :         {
    3633              :           /* Detect overlap between destination and compare sources.  */
    3634        11097 :           rtx tmp = out;
    3635              : 
    3636        11097 :           if (negate_cc_compare_p)
    3637              :             {
    3638          280 :               if (GET_MODE (op0) == DImode)
    3639          104 :                 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
    3640              :               else
    3641          176 :                 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
    3642          176 :                                               gen_lowpart (SImode, op0)));
    3643              : 
    3644          280 :               tmp = gen_reg_rtx (mode);
    3645          280 :               if (mode == DImode)
    3646          123 :                 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
    3647              :               else
    3648          157 :                 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
    3649              :                                                                   tmp)));
    3650              :             }
    3651        10817 :           else if (!sign_bit_compare_p)
    3652              :             {
    3653        10441 :               rtx flags;
    3654        10441 :               bool fpcmp = false;
    3655              : 
    3656        10441 :               compare_code = GET_CODE (compare_op);
    3657              : 
    3658        10441 :               flags = XEXP (compare_op, 0);
    3659              : 
    3660        10441 :               if (GET_MODE (flags) == CCFPmode)
    3661              :                 {
    3662           59 :                   fpcmp = true;
    3663           59 :                   compare_code
    3664           59 :                     = ix86_fp_compare_code_to_integer (compare_code);
    3665              :                 }
    3666              : 
    3667              :               /* To simplify rest of code, restrict to the GEU case.  */
    3668        10441 :               if (compare_code == LTU)
    3669              :                 {
    3670         6047 :                   std::swap (ct, cf);
    3671         6047 :                   compare_code = reverse_condition (compare_code);
    3672         6047 :                   code = reverse_condition (code);
    3673              :                 }
    3674              :               else
    3675              :                 {
    3676         4394 :                   if (fpcmp)
    3677           59 :                     PUT_CODE (compare_op,
    3678              :                               reverse_condition_maybe_unordered
    3679              :                                 (GET_CODE (compare_op)));
    3680              :                   else
    3681         4335 :                     PUT_CODE (compare_op,
    3682              :                               reverse_condition (GET_CODE (compare_op)));
    3683              :                 }
    3684              : 
    3685        10441 :               diff = (unsigned HOST_WIDE_INT) ct - cf;
    3686              :               /* Make sure we can represent the difference
    3687              :                  between the two values.  */
    3688        10441 :               if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3689              :                 return false;
    3690              : 
    3691        10440 :               if (reg_overlap_mentioned_p (out, compare_op))
    3692            0 :                 tmp = gen_reg_rtx (mode);
    3693              : 
    3694        10440 :               if (mode == DImode)
    3695         2133 :                 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
    3696              :               else
    3697         8307 :                 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
    3698              :                                                  flags, compare_op));
    3699              :             }
    3700              :           else
    3701              :             {
    3702          376 :               if (code == GT || code == GE)
    3703          153 :                 code = reverse_condition (code);
    3704              :               else
    3705              :                 {
    3706          223 :                   std::swap (ct, cf);
    3707              : 
    3708          223 :                   diff = (unsigned HOST_WIDE_INT) ct - cf;
    3709              :                   /* Make sure we can represent the difference
    3710              :                      between the two values.  */
    3711          223 :                   if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3712              :                     return false;
    3713              :                 }
    3714          371 :               tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
    3715              :             }
    3716              : 
    3717        11091 :           if (diff == 1)
    3718              :             {
    3719              :               /*
    3720              :                * cmpl op0,op1
    3721              :                * sbbl dest,dest
    3722              :                * [addl dest, ct]
    3723              :                *
    3724              :                * Size 5 - 8.
    3725              :                */
    3726         1057 :               if (ct)
    3727          881 :                 tmp = expand_simple_binop (mode, PLUS,
    3728              :                                            tmp, GEN_INT (ct),
    3729              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3730              :             }
    3731        10034 :           else if (cf == -1)
    3732              :             {
    3733              :               /*
    3734              :                * cmpl op0,op1
    3735              :                * sbbl dest,dest
    3736              :                * orl $ct, dest
    3737              :                *
    3738              :                * Size 8.
    3739              :                */
    3740          597 :               tmp = expand_simple_binop (mode, IOR,
    3741              :                                          tmp, GEN_INT (ct),
    3742              :                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
    3743              :             }
    3744         9437 :           else if (diff == -1 && ct)
    3745              :             {
    3746              :               /*
    3747              :                * cmpl op0,op1
    3748              :                * sbbl dest,dest
    3749              :                * notl dest
    3750              :                * [addl dest, cf]
    3751              :                *
    3752              :                * Size 8 - 11.
    3753              :                */
    3754          596 :               tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
    3755          596 :               if (cf)
    3756          578 :                 tmp = expand_simple_binop (mode, PLUS,
    3757              :                                            copy_rtx (tmp), GEN_INT (cf),
    3758              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3759              :             }
    3760              :           else
    3761              :             {
    3762              :               /*
    3763              :                * cmpl op0,op1
    3764              :                * sbbl dest,dest
    3765              :                * [notl dest]
    3766              :                * andl cf - ct, dest
    3767              :                * [addl dest, ct]
    3768              :                *
    3769              :                * Size 8 - 11.
    3770              :                */
    3771              : 
    3772         8841 :               if (cf == 0)
    3773              :                 {
    3774          939 :                   cf = ct;
    3775          939 :                   ct = 0;
    3776          939 :                   tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
    3777              :                 }
    3778              : 
    3779         8841 :               HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
    3780              :               /* Make sure we can represent the difference
    3781              :                  between the two values.  */
    3782         8841 :               if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
    3783        16684 :                 return false;
    3784              : 
    3785         8841 :               tmp = expand_simple_binop (mode, AND,
    3786              :                                          copy_rtx (tmp),
    3787         8841 :                                          gen_int_mode (ival, mode),
    3788              :                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
    3789         8841 :               if (ct)
    3790         7073 :                 tmp = expand_simple_binop (mode, PLUS,
    3791              :                                            copy_rtx (tmp), GEN_INT (ct),
    3792              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3793              :             }
    3794              : 
    3795        11091 :           if (!rtx_equal_p (tmp, out))
    3796          474 :             emit_move_insn (copy_rtx (out), copy_rtx (tmp));
    3797              : 
    3798        11091 :           return true;
    3799              :         }
    3800              : 
    3801        14008 :       if (diff < 0)
    3802              :         {
    3803         4776 :           machine_mode cmp_mode = GET_MODE (op0);
    3804         4776 :           enum rtx_code new_code;
    3805              : 
    3806         4776 :           if (SCALAR_FLOAT_MODE_P (cmp_mode))
    3807              :             {
    3808           70 :               gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
    3809              : 
    3810              :               /* We may be reversing a non-trapping
    3811              :                  comparison to a trapping comparison.  */
    3812          136 :                   if (HONOR_NANS (cmp_mode) && flag_trapping_math
    3813           57 :                       && code != EQ && code != NE
    3814          127 :                       && code != ORDERED && code != UNORDERED)
    3815              :                     new_code = UNKNOWN;
    3816              :                   else
    3817           13 :                     new_code = reverse_condition_maybe_unordered (code);
    3818              :             }
    3819              :           else
    3820         4706 :             new_code = ix86_reverse_condition (code, cmp_mode);
    3821         4719 :           if (new_code != UNKNOWN)
    3822              :             {
    3823         4719 :               std::swap (ct, cf);
    3824              : 
    3825         4719 :               diff = (unsigned HOST_WIDE_INT) ct - cf;
    3826              :               /* Make sure we can represent the difference
    3827              :                  between the two values.  */
    3828         4719 :               if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3829              :                 return false;
    3830              : 
    3831              :               code = new_code;
    3832              :             }
    3833              :         }
    3834              : 
    3835        14008 :       compare_code = UNKNOWN;
    3836        14008 :       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
    3837        12263 :           && CONST_INT_P (op1))
    3838              :         {
    3839         6403 :           if (op1 == const0_rtx
    3840          214 :               && (code == LT || code == GE))
    3841              :             compare_code = code;
    3842         6403 :           else if (op1 == constm1_rtx)
    3843              :             {
    3844           90 :               if (code == LE)
    3845              :                 compare_code = LT;
    3846           90 :               else if (code == GT)
    3847              :                 compare_code = GE;
    3848              :             }
    3849              :         }
    3850              : 
    3851              :       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
    3852              :       if (compare_code != UNKNOWN
    3853            0 :           && GET_MODE (op0) == GET_MODE (out)
    3854            0 :           && (cf == -1 || ct == -1))
    3855              :         {
    3856              :           /* If lea code below could be used, only optimize
    3857              :              if it results in a 2 insn sequence.  */
    3858              : 
    3859            0 :           if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
    3860            0 :                  || diff == 3 || diff == 5 || diff == 9)
    3861            0 :               || (compare_code == LT && ct == -1)
    3862            0 :               || (compare_code == GE && cf == -1))
    3863              :             {
    3864              :               /*
    3865              :                * notl op1       (if necessary)
    3866              :                * sarl $31, op1
    3867              :                * orl cf, op1
    3868              :                */
    3869            0 :               if (ct != -1)
    3870              :                 {
    3871            0 :                   cf = ct;
    3872            0 :                   ct = -1;
    3873            0 :                   code = reverse_condition (code);
    3874              :                 }
    3875              : 
    3876            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
    3877              : 
    3878            0 :               out = expand_simple_binop (mode, IOR,
    3879              :                                          out, GEN_INT (cf),
    3880              :                                          out, 1, OPTAB_DIRECT);
    3881            0 :               if (out != operands[0])
    3882            0 :                 emit_move_insn (operands[0], out);
    3883              : 
    3884            0 :               return true;
    3885              :             }
    3886              :         }
    3887              : 
    3888              : 
    3889        20696 :       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
    3890         6688 :            || diff == 3 || diff == 5 || diff == 9)
    3891         7667 :           && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
    3892        21675 :           && (mode != DImode
    3893         1885 :               || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
    3894              :         {
    3895              :           /*
    3896              :            * xorl dest,dest
    3897              :            * cmpl op1,op2
    3898              :            * setcc dest
    3899              :            * lea cf(dest*(ct-cf)),dest
    3900              :            *
    3901              :            * Size 14.
    3902              :            *
    3903              :            * This also catches the degenerate setcc-only case.
    3904              :            */
    3905              : 
    3906         7667 :           rtx tmp;
    3907         7667 :           int nops;
    3908              : 
    3909         7667 :           out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
    3910              : 
    3911         7667 :           nops = 0;
    3912              :           /* On x86_64 the lea instruction operates on Pmode, so we need
    3913              :              to get arithmetics done in proper mode to match.  */
    3914         7667 :           if (diff == 1)
    3915         6495 :             tmp = copy_rtx (out);
    3916              :           else
    3917              :             {
    3918         1172 :               rtx out1;
    3919         1172 :               out1 = copy_rtx (out);
    3920         1172 :               tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
    3921         1172 :               nops++;
    3922         1172 :               if (diff & 1)
    3923              :                 {
    3924          262 :                   tmp = gen_rtx_PLUS (mode, tmp, out1);
    3925          262 :                   nops++;
    3926              :                 }
    3927              :             }
    3928         7667 :           if (cf != 0)
    3929              :             {
    3930         6901 :               tmp = plus_constant (mode, tmp, cf);
    3931         6901 :               nops++;
    3932              :             }
    3933         7667 :           if (!rtx_equal_p (tmp, out))
    3934              :             {
    3935         7139 :               if (nops == 1)
    3936         6063 :                 out = force_operand (tmp, copy_rtx (out));
    3937              :               else
    3938         1076 :                 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
    3939              :             }
    3940         7667 :           if (!rtx_equal_p (out, operands[0]))
    3941          692 :             emit_move_insn (operands[0], copy_rtx (out));
    3942              : 
    3943         7667 :           return true;
    3944              :         }
    3945              : 
    3946              :       /*
    3947              :        * General case:                  Jumpful:
    3948              :        *   xorl dest,dest               cmpl op1, op2
    3949              :        *   cmpl op1, op2                movl ct, dest
    3950              :        *   setcc dest                   jcc 1f
    3951              :        *   decl dest                    movl cf, dest
    3952              :        *   andl (cf-ct),dest            1:
    3953              :        *   addl ct,dest
    3954              :        *
    3955              :        * Size 20.                       Size 14.
    3956              :        *
    3957              :        * This is reasonably steep, but branch mispredict costs are
    3958              :        * high on modern cpus, so consider failing only if optimizing
    3959              :        * for space.
    3960              :        */
    3961              : 
    3962         6341 :       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    3963         6341 :           && BRANCH_COST (optimize_insn_for_speed_p (),
    3964              :                           false) >= 2)
    3965              :         {
    3966            0 :           if (cf == 0)
    3967              :             {
    3968            0 :               machine_mode cmp_mode = GET_MODE (op0);
    3969            0 :               enum rtx_code new_code;
    3970              : 
    3971            0 :               if (SCALAR_FLOAT_MODE_P (cmp_mode))
    3972              :                 {
    3973            0 :                   gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
    3974              : 
    3975              :                   /* We may be reversing a non-trapping
    3976              :                      comparison to a trapping comparison.  */
    3977            0 :                   if (HONOR_NANS (cmp_mode) && flag_trapping_math
    3978            0 :                       && code != EQ && code != NE
    3979            0 :                       && code != ORDERED && code != UNORDERED)
    3980              :                     new_code = UNKNOWN;
    3981              :                   else
    3982            0 :                     new_code = reverse_condition_maybe_unordered (code);
    3983              : 
    3984              :                 }
    3985              :               else
    3986              :                 {
    3987            0 :                   new_code = ix86_reverse_condition (code, cmp_mode);
    3988            0 :                   if (compare_code != UNKNOWN && new_code != UNKNOWN)
    3989            0 :                     compare_code = reverse_condition (compare_code);
    3990              :                 }
    3991              : 
    3992            0 :               if (new_code != UNKNOWN)
    3993              :                 {
    3994            0 :                   cf = ct;
    3995            0 :                   ct = 0;
    3996            0 :                   code = new_code;
    3997              :                 }
    3998              :             }
    3999              : 
    4000            0 :           if (compare_code != UNKNOWN)
    4001              :             {
    4002              :               /* notl op1       (if needed)
    4003              :                  sarl $31, op1
    4004              :                  andl (cf-ct), op1
    4005              :                  addl ct, op1
    4006              : 
    4007              :                  For x < 0 (resp. x <= -1) there will be no notl,
    4008              :                  so if possible swap the constants to get rid of the
    4009              :                  complement.
    4010              :                  True/false will be -1/0 while code below (store flag
    4011              :                  followed by decrement) is 0/-1, so the constants need
    4012              :                  to be exchanged once more.  */
    4013              : 
    4014            0 :               if (compare_code == GE || !cf)
    4015              :                 {
    4016            0 :                   code = reverse_condition (code);
    4017            0 :                   compare_code = LT;
    4018              :                 }
    4019              :               else
    4020              :                 std::swap (ct, cf);
    4021              : 
    4022            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
    4023              :             }
    4024              :           else
    4025              :             {
    4026            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
    4027              : 
    4028            0 :               out = expand_simple_binop (mode, PLUS, copy_rtx (out),
    4029              :                                          constm1_rtx,
    4030              :                                          copy_rtx (out), 1, OPTAB_DIRECT);
    4031              :             }
    4032              : 
    4033            0 :           HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
    4034              :           /* Make sure we can represent the difference
    4035              :              between the two values.  */
    4036            0 :           if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
    4037              :             return false;
    4038              : 
    4039            0 :           out = expand_simple_binop (mode, AND, copy_rtx (out),
    4040            0 :                                      gen_int_mode (ival, mode),
    4041              :                                      copy_rtx (out), 1, OPTAB_DIRECT);
    4042            0 :           if (ct)
    4043            0 :             out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
    4044              :                                        copy_rtx (out), 1, OPTAB_DIRECT);
    4045            0 :           if (!rtx_equal_p (out, operands[0]))
    4046            0 :             emit_move_insn (operands[0], copy_rtx (out));
    4047              : 
    4048            0 :           return true;
    4049              :         }
    4050              :     }
    4051              : 
    4052       400577 :   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    4053              :     {
    4054              :       /* Try a few things more with specific constants and a variable.  */
    4055              : 
    4056            0 :       optab op;
    4057            0 :       rtx var, orig_out, out, tmp;
    4058              : 
    4059            0 :       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
    4060              :         return false;
    4061              : 
    4062            0 :       operands[2] = op2;
    4063            0 :       operands[3] = op3;
    4064              : 
    4065              :       /* If one of the two operands is an interesting constant, load a
    4066              :          constant with the above and mask it in with a logical operation.  */
    4067              : 
    4068            0 :       if (CONST_INT_P (operands[2]))
    4069              :         {
    4070            0 :           var = operands[3];
    4071            0 :           if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
    4072            0 :             operands[3] = constm1_rtx, op = and_optab;
    4073            0 :           else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
    4074            0 :             operands[3] = const0_rtx, op = ior_optab;
    4075              :           else
    4076              :             return false;
    4077              :         }
    4078            0 :       else if (CONST_INT_P (operands[3]))
    4079              :         {
    4080            0 :           var = operands[2];
    4081            0 :           if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
    4082              :             {
    4083              :               /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
    4084              :                  "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
    4085            0 :               if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
    4086            0 :                 operands[1] = simplify_gen_relational (LT, VOIDmode,
    4087            0 :                                                        GET_MODE (op0),
    4088              :                                                        op0, const0_rtx);
    4089              : 
    4090            0 :               operands[2] = constm1_rtx;
    4091            0 :               op = and_optab;
    4092              :             }
    4093            0 :           else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
    4094            0 :             operands[2] = const0_rtx, op = ior_optab;
    4095              :           else
    4096              :             return false;
    4097              :         }
    4098              :       else
    4099              :         return false;
    4100              : 
    4101            0 :       orig_out = operands[0];
    4102            0 :       tmp = gen_reg_rtx (mode);
    4103            0 :       operands[0] = tmp;
    4104              : 
    4105              :       /* Recurse to get the constant loaded.  */
    4106            0 :       if (!ix86_expand_int_movcc (operands))
    4107              :         return false;
    4108              : 
    4109              :       /* Mask in the interesting variable.  */
    4110            0 :       out = expand_binop (mode, op, var, tmp, orig_out, 0,
    4111              :                           OPTAB_WIDEN);
    4112            0 :       if (!rtx_equal_p (out, orig_out))
    4113            0 :         emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
    4114              : 
    4115            0 :       return true;
    4116              :     }
    4117              : 
    4118              :   /*
    4119              :    * For comparison with above,
    4120              :    *
    4121              :    * movl cf,dest
    4122              :    * movl ct,tmp
    4123              :    * cmpl op1,op2
    4124              :    * cmovcc tmp,dest
    4125              :    *
    4126              :    * Size 15.
    4127              :    */
    4128              : 
    4129       400577 :   if (! nonimmediate_operand (operands[2], mode))
    4130        22529 :     operands[2] = force_reg (mode, operands[2]);
    4131       400577 :   if (! nonimmediate_operand (operands[3], mode))
    4132       171999 :     operands[3] = force_reg (mode, operands[3]);
    4133              : 
    4134       400577 :   if (! register_operand (operands[2], VOIDmode)
    4135       400577 :       && (mode == QImode
    4136         1093 :           || ! register_operand (operands[3], VOIDmode)))
    4137         1564 :     operands[2] = force_reg (mode, operands[2]);
    4138              : 
    4139       400577 :   if (mode == QImode
    4140       400577 :       && ! register_operand (operands[3], VOIDmode))
    4141          592 :     operands[3] = force_reg (mode, operands[3]);
    4142              : 
    4143       400577 :   emit_insn (compare_seq);
    4144       400577 :   emit_insn (gen_rtx_SET (operands[0],
    4145              :                           gen_rtx_IF_THEN_ELSE (mode,
    4146              :                                                 compare_op, operands[2],
    4147              :                                                 operands[3])));
    4148       400577 :   return true;
    4149              : }
    4150              : 
    4151              : /* Detect conditional moves that exactly match min/max operational
    4152              :    semantics.  Note that this is IEEE safe, as long as we don't
    4153              :    interchange the operands.  The move examined is DEST = (CMP_OP0 CODE CMP_OP1) ? IF_TRUE : IF_FALSE.
    4154              : 
    4155              :    Returns FALSE if this conditional move doesn't match a MIN/MAX,
    4156              :    and TRUE if the operation is successful and instructions are emitted.  Only LT, UNGE, and LE (when NaNs need not be honored) are recognized.  */
    4157              : 
    4158              : static bool
    4159         9781 : ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
    4160              :                            rtx cmp_op1, rtx if_true, rtx if_false)
    4161              : {
    4162         9781 :   machine_mode mode = GET_MODE (dest);
    4163         9781 :   bool is_min;
    4164         9781 :   rtx tmp;
    4165              : 
    4166         9781 :   if (code == LT)
    4167              :     ;  /* LT needs no adjustment.  */
    4168         3250 :   else if (code == LE && !HONOR_NANS (mode))
    4169              :     {
    4170              :       /* We can swap LE to GE and then invert to LT.  */
    4171              :       std::swap (cmp_op0, cmp_op1);
    4172              :       std::swap (if_true, if_false);
    4173              :     }
    4174         3209 :   else if (code == UNGE)
    4175              :     std::swap (if_true, if_false);  /* Handle UNGE as LT with the two arms exchanged.  */
    4176              :   else
    4177              :     return false;
    4178              : 
    4179         8679 :   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))  /* a < b ? a : b  */
    4180              :     is_min = true;
    4181         4627 :   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))  /* a < b ? b : a  */
    4182              :     is_min = false;
    4183              :   else
    4184         1045 :     return false;
    4185              : 
    4186         7634 :   if (immediate_operand (if_false, mode))  /* Load constant arms into registers.  */
    4187            8 :     if_false = force_reg (mode, if_false);
    4188         7634 :   if (immediate_operand (if_true, mode))
    4189            0 :     if_true = force_reg (mode, if_true);
    4190              : 
    4191              :   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
    4192              :      but MODE may be a vector mode and thus not appropriate.  */
    4193         7634 :   if (!flag_finite_math_only || flag_signed_zeros)
    4194              :     {
    4195         7634 :       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
    4196         7634 :       rtvec v;
    4197              : 
    4198         7634 :       if_true = force_reg (mode, if_true);  /* The first UNSPEC operand is always a register.  */
    4199         7634 :       v = gen_rtvec (2, if_true, if_false);
    4200         7634 :       tmp = gen_rtx_UNSPEC (mode, v, u);
    4201         7634 :     }
    4202              :   else
    4203              :     {
    4204            0 :       code = is_min ? SMIN : SMAX;  /* Plain SMIN/SMAX is used only with finite math and no signed zeros.  */
    4205            0 :       if (MEM_P (if_true) && MEM_P (if_false))  /* Leave at most one operand in memory.  */
    4206            0 :         if_true = force_reg (mode, if_true);
    4207            0 :       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    4208              :     }
    4209              : 
    4210         7634 :   emit_insn (gen_rtx_SET (dest, tmp));
    4211         7634 :   return true;
    4212              : }
    4213              : 
    4214              : /* Return true if MODE is valid for vector compare to mask register.
    4215              :    The same result applies to a conditional vector move with mask register.  */
    4216              : static bool
    4217        14930 : ix86_valid_mask_cmp_mode (machine_mode mode)
    4218              : {
    4219              :   /* XOP has its own vector conditional movement.  */
    4220        14930 :   if (TARGET_XOP && !TARGET_AVX512F)
    4221              :     return false;
    4222              : 
    4223              :   /* HFmode only supports vcmpsh whose dest is mask register.  */
    4224        14924 :   if (TARGET_AVX512FP16 && mode == HFmode)
    4225              :     return true;
    4226              : 
    4227              :   /* AVX512F is needed for mask operation.  */
    4228        14832 :   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    4229              :     return false;  /* Apart from HFmode above, only vector modes qualify.  */
    4230              : 
    4231              :   /* AVX512BW is needed for vector QI/HImode,
    4232              :      AVX512VL is needed for 128/256-bit vector.  */
    4233          182 :   machine_mode inner_mode = GET_MODE_INNER (mode);
    4234          182 :   int vector_size = GET_MODE_SIZE (mode);
    4235          182 :   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    4236              :     return false;
    4237              : 
    4238          162 :   return vector_size == 64 || TARGET_AVX512VL;  /* Vectors of size 64 do not need AVX512VL.  */
    4239              : }
    4240              : 
    4241              : /* Return true if integer mask comparison should be used.  MODE is the mode of the result, CMP_MODE the mode of the compared operands, and OP_TRUE/OP_FALSE are the two arms being selected, or both NULL.  */
    4242              : static bool
    4243        52491 : ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
    4244              :                      rtx op_true, rtx op_false)
    4245              : {
    4246        52491 :   int vector_size = GET_MODE_SIZE (mode);
    4247              : 
    4248        52491 :   if (cmp_mode == HFmode)  /* Scalar HFmode compares always use a mask.  */
    4249              :     return true;
    4250        52399 :   else if (vector_size < 16)  /* Results smaller than 16 never use a mask.  */
    4251              :     return false;
    4252        46155 :   else if (vector_size == 64)  /* Results of size 64 always use a mask.  */
    4253              :     return true;
    4254        92194 :   else if (GET_MODE_INNER (cmp_mode) == HFmode)  /* So do HFmode and BFmode element compares.  */
    4255              :     return true;
    4256        92194 :   else if (GET_MODE_INNER (cmp_mode) == BFmode)
    4257              :     return true;
    4258              : 
    4259              :   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
    4260        46097 :   gcc_assert (!op_true == !op_false);
    4261              : 
    4262              :   /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
    4263              :      vector dest is required.  */
    4264        46097 :   if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    4265              :     return false;
    4266              : 
    4267              :   /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
    4268           48 :   if (op_false == CONST0_RTX (mode)  /* Either arm is zero ...  */
    4269           48 :       || op_true == CONST0_RTX (mode)
    4270           48 :       || (INTEGRAL_MODE_P (mode)  /* ... or, for integer modes, all ones.  */
    4271           40 :           && (op_true == CONSTM1_RTX (mode)
    4272           40 :               || op_false == CONSTM1_RTX (mode))))
    4273            0 :     return false;
    4274              : 
    4275              :   return true;
    4276              : }
    4277              : 
    4278              : /* Expand an SSE comparison.  Return the register with the result.  The comparison is CMP_OP0 CODE CMP_OP1.  OP_TRUE and OP_FALSE are only consulted to choose between a mask and a non-mask result and to detect overlap with DEST.  The returned register is either DEST or a new pseudo.  */
    4279              : 
    4280              : static rtx
    4281        35534 : ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
    4282              :                      rtx op_true, rtx op_false)
    4283              : {
    4284        35534 :   machine_mode mode = GET_MODE (dest);
    4285        35534 :   machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
    4286              : 
    4287              :   /* In general case result of comparison can differ from operands' type.  */
    4288        35534 :   machine_mode cmp_mode;
    4289              : 
    4290              :   /* In AVX512F the result of comparison is an integer mask.  */
    4291        35534 :   bool maskcmp = false;
    4292        35534 :   rtx x;
    4293              : 
    4294        35534 :   if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    4295              :     {
    4296          145 :       unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);  /* One mask bit per compared unit.  */
    4297          145 :       maskcmp = true;
    4298          145 :       cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;  /* The mask mode is never narrower than QImode.  */
    4299              :     }
    4300              :   else
    4301              :     cmp_mode = cmp_ops_mode;
    4302              : 
    4303        35534 :   cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
    4304              : 
    4305        71068 :   bool (*op1_predicate)(rtx, machine_mode)  /* Predicate the second operand must satisfy.  */
    4306        35534 :     = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
    4307              : 
    4308        35534 :   if (!op1_predicate (cmp_op1, cmp_ops_mode))
    4309            0 :     cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
    4310              : 
    4311        35534 :   if (optimize
    4312          506 :       || (maskcmp && cmp_mode != mode)
    4313          506 :       || (op_true && reg_overlap_mentioned_p (dest, op_true))
    4314        36040 :       || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    4315        69911 :     dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);  /* Use a fresh result register when optimizing, when the mask mode differs from MODE, or when DEST overlaps an arm.  */
    4316              : 
    4317        35534 :   if (maskcmp)
    4318              :     {
    4319          145 :       bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
    4320          145 :       gcc_assert (ok);
    4321              :       return dest;
    4322              :     }
    4323              : 
    4324        35389 :   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
    4325              : 
    4326        35389 :   if (cmp_mode != mode)
    4327              :     {
    4328         7215 :       x = force_reg (cmp_ops_mode, x);  /* Compute the comparison in its own mode, then convert it into DEST.  */
    4329         7215 :       convert_move (dest, x, false);
    4330              :     }
    4331              :   else
    4332        28174 :     emit_insn (gen_rtx_SET (dest, x));
    4333              : 
    4334              :   return dest;
    4335              : }
    4336              : 
    4337              : /* Emit DST = SRC1 CODE SRC2, an x86 binary operation CODE in mode MODE, for SSE vector
    4338              :    instructions that can be performed using GP registers.  For integer vector modes no wider than SImode the emitted insn also clobbers the flags register.  */
    4339              : 
    4340              : static void
    4341         7066 : ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
    4342              :                      rtx dst, rtx src1, rtx src2)
    4343              : {
    4344         7066 :   rtx tmp;
    4345              : 
    4346         7066 :   tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
    4347              : 
    4348         7066 :   if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
    4349         7066 :       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    4350              :     {
    4351          102 :       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    4352          102 :       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));  /* Pair the SET with the flags clobber.  */
    4353              :     }
    4354              : 
    4355         7066 :   emit_insn (tmp);
    4356         7066 : }
    4357              : 
    4358              : /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
    4359              :    operations.  This is used for both scalar and vector conditional moves.  CMP is the already computed comparison result; when its mode is an integer mode different from DEST's mode it is used as a mask register operand.  */
    4360              : 
    4361              : void
    4362        10163 : ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
    4363              : {
    4364        10163 :   machine_mode mode = GET_MODE (dest);
    4365        10163 :   machine_mode cmpmode = GET_MODE (cmp);
    4366        10163 :   rtx x;
    4367              : 
    4368              :   /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
    4369        10163 :   if (rtx_equal_p (op_true, op_false))
    4370              :     {
    4371            0 :       emit_move_insn (dest, op_true);
    4372            0 :       return;
    4373              :     }
    4374              : 
    4375              :   /* If we have an integer mask and FP value then we need
    4376              :      to cast mask to FP mode.  */
    4377        10163 :   if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    4378              :     {
    4379         1545 :       cmp = force_reg (cmpmode, cmp);
    4380         1545 :       cmp = gen_rtx_SUBREG (mode, cmp, 0);  /* View the vector mask in MODE.  */
    4381              :     }
    4382              : 
    4383              :   /* In AVX512F the result of comparison is an integer mask.  */
    4384        10163 :   if (mode != cmpmode
    4385         1690 :       && GET_MODE_CLASS (cmpmode) == MODE_INT)
    4386              :     {
    4387          145 :       gcc_assert (ix86_valid_mask_cmp_mode (mode));
    4388              :       /* Using scalar/vector move with mask register.  */
    4389          145 :       cmp = force_reg (cmpmode, cmp);
    4390              :       /* Optimize for mask zero.  */
    4391          290 :       op_true = (op_true != CONST0_RTX (mode)
    4392          145 :                  ? force_reg (mode, op_true) : op_true);
    4393          290 :       op_false = (op_false != CONST0_RTX (mode)
    4394          145 :                   ? force_reg (mode, op_false) : op_false);
    4395          145 :       if (op_true == CONST0_RTX (mode))  /* Zero true arm: negate the mask and exchange the arms.  */
    4396              :         {
    4397            0 :           if (cmpmode == E_DImode && !TARGET_64BIT)  /* DImode mask without TARGET_64BIT: use the knotdi pattern.  */
    4398              :             {
    4399            0 :               x = gen_reg_rtx (cmpmode);
    4400            0 :               emit_insn (gen_knotdi (x, cmp));
    4401              :             }
    4402              :           else
    4403            0 :             x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
    4404              :           cmp = x;
    4405              :           /* Reverse op_true op_false.  */
    4406              :           std::swap (op_true, op_false);
    4407              :         }
    4408              : 
    4409          145 :       if (mode == HFmode)
    4410           92 :         emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
    4411              :       else
    4412           53 :         emit_insn (gen_rtx_SET (dest,
    4413              :                                 gen_rtx_VEC_MERGE (mode,
    4414              :                                                    op_true, op_false, cmp)));
    4415          145 :       return;
    4416              :     }
    4417              : 
    4418        10018 :   if (vector_all_ones_operand (op_true, mode)  /* CMP ? -1 : 0 is CMP itself.  */
    4419        10018 :       && op_false == CONST0_RTX (mode))
    4420              :     {
    4421            2 :       emit_move_insn (dest, cmp);
    4422            2 :       return;
    4423              :     }
    4424        10016 :   else if (op_false == CONST0_RTX (mode))  /* CMP ? x : 0  ==>  CMP & x  */
    4425              :     {
    4426          903 :       x = expand_simple_binop (mode, AND, cmp, op_true,
    4427              :                                dest, 1, OPTAB_DIRECT);
    4428          903 :       if (x != dest)
    4429            0 :         emit_move_insn (dest, x);
    4430          903 :       return;
    4431              :     }
    4432         9113 :   else if (op_true == CONST0_RTX (mode))  /* CMP ? 0 : x  ==>  ~CMP & x  */
    4433              :     {
    4434          116 :       op_false = force_reg (mode, op_false);
    4435          116 :       x = gen_rtx_NOT (mode, cmp);
    4436          116 :       ix86_emit_vec_binop (AND, mode, dest, x, op_false);
    4437          116 :       return;
    4438              :     }
    4439         8997 :   else if (vector_all_ones_operand (op_true, mode))  /* CMP ? -1 : x  ==>  CMP | x  */
    4440              :     {
    4441            2 :       x = expand_simple_binop (mode, IOR, cmp, op_false,
    4442              :                                dest, 1, OPTAB_DIRECT);
    4443            2 :       if (x != dest)
    4444            0 :         emit_move_insn (dest, x);
    4445            2 :       return;
    4446              :     }
    4447              : 
    4448         8995 :   if (TARGET_XOP)  /* With XOP, emit the IF_THEN_ELSE directly.  */
    4449              :     {
    4450           65 :       op_true = force_reg (mode, op_true);
    4451              : 
    4452           65 :       if (GET_MODE_SIZE (mode) < 16
    4453           65 :           || !nonimmediate_operand (op_false, mode))
    4454           49 :         op_false = force_reg (mode, op_false);
    4455              : 
    4456           65 :       emit_insn (gen_rtx_SET (dest,
    4457              :                               gen_rtx_IF_THEN_ELSE (mode, cmp,
    4458              :                                                     op_true, op_false)));
    4459           65 :       return;
    4460              :     }
    4461              : 
    4462         8930 :   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;  /* Blend pattern generator, if one applies.  */
    4463         8930 :   machine_mode blend_mode = mode;  /* Mode in which the blend is performed.  */
    4464              : 
    4465         8930 :   if (GET_MODE_SIZE (mode) < 16
    4466         8930 :       || !vector_operand (op_true, mode))
    4467         2297 :     op_true = force_reg (mode, op_true);
    4468              : 
    4469         8930 :   op_false = force_reg (mode, op_false);
    4470              : 
    4471         8930 :   switch (mode)
    4472              :     {
    4473           29 :     case E_V2SFmode:
    4474           29 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4475              :         gen = gen_mmx_blendvps;
    4476              :       break;
    4477          320 :     case E_V4SFmode:
    4478          320 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4479              :         gen = gen_sse4_1_blendvps;
    4480              :       break;
    4481          157 :     case E_V2DFmode:
    4482          157 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4483              :         gen = gen_sse4_1_blendvpd;
    4484              :       break;
    4485         1097 :     case E_SFmode:
    4486         1097 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4487              :         gen = gen_sse4_1_blendvss;
    4488              :       break;
    4489          818 :     case E_DFmode:
    4490          818 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4491              :         gen = gen_sse4_1_blendvsd;
    4492              :       break;
    4493          222 :     case E_V8QImode:
    4494          222 :     case E_V4HImode:
    4495          222 :     case E_V4HFmode:
    4496          222 :     case E_V4BFmode:
    4497          222 :     case E_V2SImode:
    4498          222 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4499              :         {
    4500              :           gen = gen_mmx_pblendvb_v8qi;
    4501              :           blend_mode = V8QImode;
    4502              :         }
    4503              :       break;
    4504           95 :     case E_V4QImode:
    4505           95 :     case E_V2HImode:
    4506           95 :     case E_V2HFmode:
    4507           95 :     case E_V2BFmode:
    4508           95 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4509              :         {
    4510              :           gen = gen_mmx_pblendvb_v4qi;
    4511              :           blend_mode = V4QImode;
    4512              :         }
    4513              :       break;
    4514           36 :     case E_V2QImode:
    4515           36 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4516              :         gen = gen_mmx_pblendvb_v2qi;
    4517              :       break;
    4518         5497 :     case E_V16QImode:
    4519         5497 :     case E_V8HImode:
    4520         5497 :     case E_V8HFmode:
    4521         5497 :     case E_V8BFmode:
    4522         5497 :     case E_V4SImode:
    4523         5497 :     case E_V2DImode:
    4524         5497 :     case E_V1TImode:
    4525         5497 :       if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
    4526              :         {
    4527              :           gen = gen_sse4_1_pblendvb;
    4528              :           blend_mode = V16QImode;
    4529              :         }
    4530              :       break;
    4531           99 :     case E_V8SFmode:
    4532           99 :       if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
    4533              :         gen = gen_avx_blendvps256;
    4534              :       break;
    4535          192 :     case E_V4DFmode:
    4536          192 :       if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
    4537              :         gen = gen_avx_blendvpd256;
    4538              :       break;
    4539          368 :     case E_V32QImode:
    4540          368 :     case E_V16HImode:
    4541          368 :     case E_V16HFmode:
    4542          368 :     case E_V16BFmode:
    4543          368 :     case E_V8SImode:
    4544          368 :     case E_V4DImode:
    4545          368 :       if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
    4546              :         {
    4547              :           gen = gen_avx2_pblendvb;
    4548              :           blend_mode = V32QImode;
    4549              :         }
    4550              :       break;
    4551              : 
    4552            0 :     case E_V64QImode:  /* The remaining modes select a blendm pattern without a target check.  */
    4553            0 :       gen = gen_avx512bw_blendmv64qi;
    4554            0 :       break;
    4555            0 :     case E_V32HImode:
    4556            0 :       gen = gen_avx512bw_blendmv32hi;
    4557            0 :       break;
    4558            0 :     case E_V32HFmode:
    4559            0 :       gen = gen_avx512bw_blendmv32hf;
    4560            0 :       break;
    4561            0 :     case E_V32BFmode:
    4562            0 :       gen = gen_avx512bw_blendmv32bf;
    4563            0 :       break;
    4564            0 :     case E_V16SImode:
    4565            0 :       gen = gen_avx512f_blendmv16si;
    4566            0 :       break;
    4567            0 :     case E_V8DImode:
    4568            0 :       gen = gen_avx512f_blendmv8di;
    4569            0 :       break;
    4570            0 :     case E_V8DFmode:
    4571            0 :       gen = gen_avx512f_blendmv8df;
    4572            0 :       break;
    4573              :     case E_V16SFmode:
    4574              :       gen = gen_avx512f_blendmv16sf;
    4575              :       break;
    4576              : 
    4577              :     default:
    4578              :       break;
    4579              :     }
    4580              : 
    4581            0 :   if (gen != NULL)
    4582              :     {
    4583         2081 :       if (blend_mode == mode)
    4584              :         x = dest;
    4585              :       else
    4586              :         {
    4587         1016 :           x = gen_reg_rtx (blend_mode);  /* Blend in BLEND_MODE on lowparts; the result is copied back below.  */
    4588         1016 :           op_false = gen_lowpart (blend_mode, op_false);
    4589         1016 :           op_true = gen_lowpart (blend_mode, op_true);
    4590         1016 :           cmp = gen_lowpart (blend_mode, cmp);
    4591              :         }
    4592              : 
    4593         2081 :       emit_insn (gen (x, op_false, op_true, cmp));  /* Note the operand order: OP_FALSE precedes OP_TRUE.  */
    4594              : 
    4595         2081 :       if (x != dest)
    4596         1016 :         emit_move_insn (dest, gen_lowpart (mode, x));
    4597              :     }
    4598              :   else
    4599              :     {
    4600         6849 :       rtx t2, t3;  /* No blend pattern: DEST = (OP_TRUE & CMP) | (~CMP & OP_FALSE).  */
    4601              : 
    4602         6849 :       t2 = expand_simple_binop (mode, AND, op_true, cmp,
    4603              :                                 NULL, 1, OPTAB_DIRECT);
    4604              : 
    4605         6849 :       t3 = gen_reg_rtx (mode);
    4606         6849 :       x = gen_rtx_NOT (mode, cmp);
    4607         6849 :       ix86_emit_vec_binop (AND, mode, t3, x, op_false);
    4608              : 
    4609         6849 :       x = expand_simple_binop (mode, IOR, t3, t2,
    4610              :                                dest, 1, OPTAB_DIRECT);
    4611         6849 :       if (x != dest)
    4612            0 :         emit_move_insn (dest, x);
    4613              :     }
    4614              : }
    4615              : 
    4616              : /* Swap, force into registers, or otherwise massage the two operands
    4617              :    to an sse comparison with a mask result.  Thus we differ a bit from
    4618              :    ix86_prepare_fp_compare_args which expects to produce a flags result.
    4619              : 
    4620              :    The DEST operand exists to help determine whether to commute commutative
    4621              :    operators.  The POP0/POP1 operands are updated in place.  The new
    4622              :    comparison code is returned, or UNKNOWN if not implementable.  */
    4623              : 
    4624              : static enum rtx_code
    4625        16996 : ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
    4626              :                                   rtx *pop0, rtx *pop1)
    4627              : {
    4628        16996 :   switch (code)
    4629              :     {
    4630           67 :     case LTGT:
    4631           67 :     case UNEQ:
    4632              :       /* AVX supports all the needed comparisons.  */
    4633           67 :       if (TARGET_AVX)
    4634              :         break;
    4635              :       /* We have no LTGT as an operator.  We could implement it with
    4636              :          NE & ORDERED, but this requires an extra temporary.  It's
    4637              :          not clear that it's worth it.  */
    4638              :       return UNKNOWN;  /* LTGT and UNEQ are rejected without AVX.  */
    4639              : 
    4640              :     case LT:
    4641              :     case LE:
    4642              :     case UNGT:
    4643              :     case UNGE:
    4644              :       /* These are supported directly.  */
    4645              :       break;
    4646              : 
    4647         5365 :     case EQ:
    4648         5365 :     case NE:
    4649         5365 :     case UNORDERED:
    4650         5365 :     case ORDERED:
    4651              :       /* AVX has 3 operand comparisons, no need to swap anything.  */
    4652         5365 :       if (TARGET_AVX)
    4653              :         break;
    4654              :       /* For commutative operators, try to canonicalize the destination
    4655              :          operand to be first in the comparison - this helps reload to
    4656              :          avoid extra moves.  */
    4657          790 :       if (!dest || !rtx_equal_p (dest, *pop1))  /* Swap only when DEST equals the second operand.  */
    4658              :         break;
    4659              :       /* FALLTHRU */
    4660              : 
    4661        10581 :     case GE:
    4662        10581 :     case GT:
    4663        10581 :     case UNLE:
    4664        10581 :     case UNLT:
    4665              :       /* These are not supported directly before AVX, and furthermore
    4666              :          ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
    4667              :          comparison operands to transform into something that is
    4668              :          supported.  */
    4669        10581 :       std::swap (*pop0, *pop1);
    4670        10581 :       code = swap_condition (code);
    4671        10581 :       break;
    4672              : 
    4673            0 :     default:
    4674            0 :       gcc_unreachable ();
    4675              :     }
    4676              : 
    4677              :   return code;  /* The code from swap_condition if the operands were exchanged above, otherwise CODE unchanged.  */
    4678              : }
    4679              : 
    4680              : /* Expand a floating-point conditional move.  Return true if successful.
                      :
                      :    OPERANDS[0] is the destination and supplies the move mode,
                      :    OPERANDS[1] is the comparison rtx, and OPERANDS[2] / OPERANDS[3]
                      :    are the then / else values of the emitted selection.
                      :
                      :    Return false when:
                      :    - the compared operand is BFmode and OPERANDS[1] does not satisfy
                      :      ix86_fp_comparison_operator;
                      :    - on the SSE path, the comparison mode differs from the move mode
                      :      or ix86_prepare_sse_fp_compare_args returns UNKNOWN;
                      :    - otherwise, the compared operand is TImode, or DImode without
                      :      TARGET_64BIT.  */
    4681              : 
    4682              : bool
    4683        95697 : ix86_expand_fp_movcc (rtx operands[])
    4684              : {
    4685        95697 :   machine_mode mode = GET_MODE (operands[0]);
    4686        95697 :   enum rtx_code code = GET_CODE (operands[1]);
    4687        95697 :   rtx tmp, compare_op;
    4688        95697 :   rtx op0 = XEXP (operands[1], 0);
    4689        95697 :   rtx op1 = XEXP (operands[1], 1);
    4690              : 
    4691        95697 :   if (GET_MODE (op0) == BFmode
    4692        95697 :       && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    4693              :     return false;
    4694              : 
    4695        95697 :   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
    4696              :     {
    4697        65189 :       machine_mode cmode;
    4698              : 
    4699              :       /* Since we've no cmove for sse registers, don't force bad register
    4700              :          allocation just to gain access to it.  Deny movcc when the
    4701              :          comparison mode doesn't match the move mode.  */
    4702        65189 :       cmode = GET_MODE (op0);
    4703        65189 :       if (cmode == VOIDmode)
    4704            0 :         cmode = GET_MODE (op1);
    4705        65189 :       if (cmode != mode)
    4706              :         return false;
    4707              : 
    4708         9801 :       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
    4709         9801 :       if (code == UNKNOWN)
    4710              :         return false;
    4711              :       /* Try the min/max expander first; a true result means we are done.  */
    4712         9781 :       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
    4713              :                                      operands[2], operands[3]))
    4714              :         return true;
    4715              :       /* Otherwise emit the comparison, then a conditional move on its result.  */
    4716         2147 :       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
    4717              :                                  operands[2], operands[3]);
    4718         2147 :       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
    4719         2147 :       return true;
    4720              :     }
    4721              : 
    4722        30508 :   if (GET_MODE (op0) == TImode
    4723        30508 :       || (GET_MODE (op0) == DImode
    4724           72 :           && !TARGET_64BIT))
    4725              :     return false;
    4726              : 
    4727              :   /* The floating point conditional move instructions don't directly
    4728              :      support conditions resulting from a signed integer comparison.  */
    4729              : 
    4730        30436 :   compare_op = ix86_expand_compare (code, op0, op1);
    4731        30436 :   if (!fcmov_comparison_operator (compare_op, VOIDmode))
    4732              :     {
    4733          146 :       tmp = gen_reg_rtx (QImode);
    4734          146 :       ix86_expand_setcc (tmp, code, op0, op1);
    4735              :       /* Test the QImode flag in TMP against zero instead.  */
    4736          146 :       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    4737              :     }
    4738              : 
    4739        30436 :   operands[2] = force_reg (mode, operands[2]);
    4740        30436 :   operands[3] = force_reg (mode, operands[3]);
    4741        30436 :   emit_insn (gen_rtx_SET (operands[0],
    4742              :                           gen_rtx_IF_THEN_ELSE (mode, compare_op,
    4743              :                                                 operands[2], operands[3])));
    4744              : 
    4745        30436 :   return true;
    4746              : }
    4747              : 
    4748              : /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.
                      :    Map integer comparison CODE to its immediate: EQ -> 0, LT/LTU -> 1,
                      :    LE/LEU -> 2, NE -> 4, GE/GEU -> 5, GT/GTU -> 6.  A signed code and
                      :    its unsigned counterpart share one value.  Any other code is
                      :    treated as unreachable.  */
    4749              : 
    4750              : static int
    4751         4885 : ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
    4752              : {
    4753         4885 :   switch (code)
    4754              :     {
    4755              :     case EQ:
    4756              :       return 0;
    4757          379 :     case LT:
    4758          379 :     case LTU:
    4759          379 :       return 1;
    4760          212 :     case LE:
    4761          212 :     case LEU:
    4762          212 :       return 2;
    4763         3072 :     case NE:
    4764         3072 :       return 4;
    4765          307 :     case GE:
    4766          307 :     case GEU:
    4767          307 :       return 5;
    4768          502 :     case GT:
    4769          502 :     case GTU:
    4770          502 :       return 6;
    4771            0 :     default:
    4772            0 :       gcc_unreachable ();
    4773              :     }
    4774              : }
    4775              : 
    4776              : /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.
                      :    Map floating-point comparison CODE, including the unordered
                      :    variants, UNEQ, LTGT, ORDERED and UNORDERED, to its immediate.
                      :    Any other code is treated as unreachable.
                      :    NOTE(review): the values look like the AVX VCMPPS/VCMPPD
                      :    comparison-predicate encodings; confirm against the Intel SDM
                      :    before changing any of them.  */
    4777              : 
    4778              : static int
    4779         1785 : ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
    4780              : {
    4781         1785 :   switch (code)
    4782              :     {
    4783              :     case EQ:
    4784              :       return 0x00;
    4785          354 :     case NE:
    4786          354 :       return 0x04;
    4787          514 :     case GT:
    4788          514 :       return 0x0e;
    4789           88 :     case LE:
    4790           88 :       return 0x02;
    4791           53 :     case GE:
    4792           53 :       return 0x0d;
    4793          624 :     case LT:
    4794          624 :       return 0x01;
    4795            2 :     case UNLE:
    4796            2 :       return 0x0a;
    4797            2 :     case UNLT:
    4798            2 :       return 0x09;
    4799           11 :     case UNGE:
    4800           11 :       return 0x05;
    4801           44 :     case UNGT:
    4802           44 :       return 0x06;
    4803            2 :     case UNEQ:
    4804            2 :       return 0x18;
    4805            0 :     case LTGT:
    4806            0 :       return 0x0c;
    4807            2 :     case ORDERED:
    4808            2 :       return 0x07;
    4809            2 :     case UNORDERED:
    4810            2 :       return 0x03;
    4811            0 :     default:
    4812            0 :       gcc_unreachable ();
    4813              :     }
    4814              : }
    4815              : 
    4816              : /* Return immediate value to be used in UNSPEC_PCMP
    4817              :    for comparison CODE in MODE.  Modes satisfying FLOAT_MODE_P use the
                      :    floating-point table; every other mode uses the integer table.  */
    4818              : 
    4819              : static int
    4820         6670 : ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
    4821              : {
    4822         6670 :   if (FLOAT_MODE_P (mode))
    4823         1785 :     return ix86_fp_cmp_code_to_pcmp_immediate (code);
    4824         4885 :   return ix86_int_cmp_code_to_pcmp_immediate (code);
    4825              : }
    4826              : 
    4827              : /* Expand AVX-512 vector comparison.
                      :    Emit DEST = unspec [CMP_OP0, CMP_OP1, IMM], in the mode of DEST,
                      :    where IMM is the immediate for CODE in the mode of CMP_OP0.
                      :    The unsigned codes LEU, GTU, GEU and LTU use UNSPEC_UNSIGNED_PCMP,
                      :    every other code UNSPEC_PCMP.  Always returns true.  */
    4828              : 
    4829              : bool
    4830         6670 : ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
    4831              : {
    4832         6670 :   machine_mode mask_mode = GET_MODE (dest);
    4833         6670 :   machine_mode cmp_mode = GET_MODE (cmp_op0);
    4834         6670 :   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
    4835         6670 :   int unspec_code;
    4836         6670 :   rtx unspec;
    4837              :   /* The immediate is shared by signed and unsigned codes; the unspec tells them apart.  */
    4838         6670 :   switch (code)
    4839              :     {
    4840              :     case LEU:
    4841              :     case GTU:
    4842              :     case GEU:
    4843              :     case LTU:
    4844              :       unspec_code = UNSPEC_UNSIGNED_PCMP;
    4845              :       break;
    4846              : 
    4847         6256 :     default:
    4848         6256 :       unspec_code = UNSPEC_PCMP;
    4849              :     }
    4850              : 
    4851         6670 :   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
    4852              :                            unspec_code);
    4853         6670 :   emit_insn (gen_rtx_SET (dest, unspec));
    4854              : 
    4855         6670 :   return true;
    4856              : }
    4857              : 
    4858              : /* Expand fp vector comparison.
                      :    OPERANDS[1] is the comparison rtx, OPERANDS[2] and OPERANDS[3] are
                      :    the values compared, and OPERANDS[0] receives the result.
                      :    OPERANDS[2] and OPERANDS[3] are passed by address to
                      :    ix86_prepare_sse_fp_compare_args, which may exchange them.
                      :    When that helper returns UNKNOWN, the comparison is built from two
                      :    simpler comparisons combined with AND or IOR.
                      :    Always returns true.  */
    4859              : 
    4860              : bool
    4861         7195 : ix86_expand_fp_vec_cmp (rtx operands[])
    4862              : {
    4863         7195 :   enum rtx_code code = GET_CODE (operands[1]);
    4864         7195 :   rtx cmp;
    4865              : 
    4866         7195 :   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
    4867              :                                            &operands[2], &operands[3]);
    4868         7195 :   if (code == UNKNOWN)
    4869              :     {
    4870           20 :       rtx temp;
    4871           20 :       switch (GET_CODE (operands[1]))  /* CODE is UNKNOWN here, so re-read the original.  */
    4872              :         {
    4873            2 :         case LTGT:
    4874            2 :           temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
    4875              :                                       operands[3], NULL, NULL);
    4876            2 :           cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
    4877              :                                      operands[3], NULL, NULL);
    4878            2 :           code = AND;  /* LTGT is built as ORDERED & NE.  */
    4879            2 :           break;
    4880           18 :         case UNEQ:
    4881           18 :           temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
    4882              :                                       operands[3], NULL, NULL);
    4883           18 :           cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
    4884              :                                      operands[3], NULL, NULL);
    4885           18 :           code = IOR;  /* UNEQ is built as UNORDERED | EQ.  */
    4886           18 :           break;
    4887            0 :         default:
    4888            0 :           gcc_unreachable ();
    4889              :         }
    4890           20 :       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
    4891              :                                  OPTAB_DIRECT);
    4892              :     }
    4893              :   else
    4894         7175 :     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
    4895              :                                NULL, NULL);
    4896              :   /* Copy only when the result did not already land in the destination.  */
    4897         7195 :   if (operands[0] != cmp)
    4898         7112 :     emit_move_insn (operands[0], cmp);
    4899              : 
    4900         7195 :   return true;
    4901              : }
    4902              : /* Expand the integer vector comparison COP0 <CODE> COP1 for DEST.
                      :    OP_TRUE and OP_FALSE, when non-null, are handed to
                      :    ix86_expand_sse_cmp as the values for a true / false comparison;
                      :    when null, all-ones / zero in the mode of DEST stand in for them
                      :    while a strategy is chosen.
                      :    *NEGATE is cleared on entry and set when the comparison emitted is
                      :    the inverse of the one requested; OP_TRUE and OP_FALSE are
                      :    exchanged here in that case.
                      :    Return the rtx produced by ix86_expand_sse_cmp, converted to the
                      :    mode of DEST with gen_lowpart where needed, or NULL when the mode
                      :    of COP0 is V2DImode and the required SSE4.1 (EQ) / SSE4.2 (GT, GTU)
                      :    support is missing.  */
    4903              : static rtx
    4904        17155 : ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
    4905              :                          rtx op_true, rtx op_false, bool *negate)
    4906              : {
    4907        17155 :   machine_mode data_mode = GET_MODE (dest);
    4908        17155 :   machine_mode mode = GET_MODE (cop0);
    4909        17155 :   rtx x;
    4910              : 
    4911        17155 :   *negate = false;
    4912              : 
    4913              :   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
    4914        17155 :   if (TARGET_XOP
    4915          201 :       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
    4916        17356 :       && GET_MODE_SIZE (mode) <= 16)
    4917              :     ;
    4918              :   /* AVX512F supports all of the comparisons
    4919              :      on all 128/256/512-bit vector int types.  */
    4920        16957 :   else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    4921              :     ;
    4922              :   else
    4923              :     {
    4924              :       /* Canonicalize the comparison to EQ, GT, GTU.  */
    4925        16904 :       switch (code)
    4926              :         {
    4927              :         case EQ:
    4928              :         case GT:
    4929              :         case GTU:
    4930              :           break;
    4931              : 
    4932          851 :         case LE:
    4933          851 :         case LEU:
    4934              :           /* x <= cst can be handled as x < cst + 1 unless there is
    4935              :              wrap around in cst + 1.
                      :              NOTE(review): the + 1 below is computed on INTVAL before
                      :              gen_int_mode truncates to ELTMODE; confirm it cannot
                      :              overflow on the host for LEU with elements as wide as
                      :              INTVAL's type.  */
    4936          851 :           if (CONST_VECTOR_P (cop1)
    4937         1425 :               && GET_MODE_INNER (mode) != TImode)
    4938              :             {
    4939          574 :               unsigned int n_elts = GET_MODE_NUNITS (mode), i;
    4940          574 :               machine_mode eltmode = GET_MODE_INNER (mode);
    4941         3659 :               for (i = 0; i < n_elts; ++i)
    4942              :                 {
    4943         3086 :                   rtx elt = CONST_VECTOR_ELT (cop1, i);
    4944         3086 :                   if (!CONST_INT_P (elt))
    4945              :                     break;
    4946         3086 :                   if (code == LE)
    4947              :                     {
    4948              :                       /* For LE punt if some element is signed maximum.  */
    4949         2062 :                       if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
    4950              :                           == (GET_MODE_MASK (eltmode) >> 1))
    4951              :                         break;
    4952              :                     }
    4953              :                   /* For LEU punt if some element is unsigned maximum.  */
    4954         1024 :                   else if (elt == constm1_rtx)
    4955              :                     break;
    4956              :                 }
    4957          574 :               if (i == n_elts)  /* No element made the scan stop early.  */
    4958              :                 {
    4959          573 :                   rtvec v = rtvec_alloc (n_elts);
    4960         4230 :                   for (i = 0; i < n_elts; ++i)
    4961         3084 :                     RTVEC_ELT (v, i)
    4962         3084 :                       = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
    4963              :                                       eltmode);
    4964          573 :                   cop1 = gen_rtx_CONST_VECTOR (mode, v);
    4965          573 :                   std::swap (cop0, cop1);  /* x < cst + 1 is emitted as cst + 1 > x.  */
    4966          573 :                   code = code == LE ? GT : GTU;
    4967              :                   break;
    4968              :                 }
    4969              :             }
    4970              :           /* FALLTHRU */
    4971         3314 :         case NE:
    4972         3314 :           code = reverse_condition (code);
    4973         3314 :           *negate = true;  /* The reversed comparison computes the inverse.  */
    4974         3314 :           break;
    4975              : 
    4976          435 :         case GE:
    4977          435 :         case GEU:
    4978              :           /* x >= cst can be handled as x > cst - 1 unless there is
    4979              :              wrap around in cst - 1.
                      :              NOTE(review): the - 1 below is computed on INTVAL before
                      :              gen_int_mode truncates to ELTMODE; confirm it cannot
                      :              overflow on the host for GEU with elements as wide as
                      :              INTVAL's type.  */
    4980          435 :           if (CONST_VECTOR_P (cop1)
    4981          644 :               && GET_MODE_INNER (mode) != TImode)
    4982              :             {
    4983          209 :               unsigned int n_elts = GET_MODE_NUNITS (mode), i;
    4984          209 :               machine_mode eltmode = GET_MODE_INNER (mode);
    4985         1453 :               for (i = 0; i < n_elts; ++i)
    4986              :                 {
    4987         1292 :                   rtx elt = CONST_VECTOR_ELT (cop1, i);
    4988         1292 :                   if (!CONST_INT_P (elt))
    4989              :                     break;
    4990         1292 :                   if (code == GE)
    4991              :                     {
    4992              :                       /* For GE punt if some element is signed minimum.  */
    4993         1244 :                       if (INTVAL (elt) < 0
    4994          136 :                           && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
    4995              :                               == 0))
    4996              :                         break;
    4997              :                     }
    4998              :                   /* For GEU punt if some element is zero.  */
    4999           48 :                   else if (elt == const0_rtx)
    5000              :                     break;
    5001              :                 }
    5002          209 :               if (i == n_elts)  /* No element made the scan stop early.  */
    5003              :                 {
    5004          161 :                   rtvec v = rtvec_alloc (n_elts);
    5005         1566 :                   for (i = 0; i < n_elts; ++i)
    5006         1244 :                     RTVEC_ELT (v, i)
    5007         1244 :                       = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
    5008              :                                       eltmode);
    5009          161 :                   cop1 = gen_rtx_CONST_VECTOR (mode, v);
    5010          161 :                   code = code == GE ? GT : GTU;
    5011              :                   break;
    5012              :                 }
    5013              :             }
    5014          274 :           code = reverse_condition (code);
    5015          274 :           *negate = true;
    5016              :           /* FALLTHRU */
    5017              : 
    5018         1632 :         case LT:
    5019         1632 :         case LTU:
    5020         1632 :           std::swap (cop0, cop1);
    5021         1632 :           code = swap_condition (code);
    5022         1632 :           break;
    5023              : 
    5024            0 :         default:
    5025            0 :           gcc_unreachable ();
    5026              :         }
    5027              : 
    5028              :       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
    5029        16904 :       if (mode == V2DImode)
    5030              :         {
    5031          788 :           switch (code)
    5032              :             {
    5033          584 :             case EQ:
    5034              :               /* SSE4.1 supports EQ.  */
    5035          584 :               if (!TARGET_SSE4_1)
    5036        17155 :                 return NULL;
    5037              :               break;
    5038              : 
    5039          204 :             case GT:
    5040          204 :             case GTU:
    5041              :               /* SSE4.2 supports GT/GTU.  */
    5042          204 :               if (!TARGET_SSE4_2)
    5043              :                 return NULL;
    5044              :               break;
    5045              : 
    5046            0 :             default:
    5047            0 :               gcc_unreachable ();
    5048              :             }
    5049              :         }
    5050              : 
    5051        16904 :       if (CONST_VECTOR_P (cop0))
    5052         1228 :         cop0 = force_reg (mode, cop0);
    5053        15676 :       else if (CONST_VECTOR_P (cop1))
    5054         7221 :         cop1 = force_reg (mode, cop1);
    5055              :       /* Local stand-ins for the select values, used only by the test below.  */
    5056        16904 :       rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
    5057        16904 :       rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
    5058        16904 :       if (*negate)
    5059         3588 :         std::swap (optrue, opfalse);
    5060              : 
    5061              :       /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
    5062              :          not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
    5063              :          min (x, y) == x).  While we add one instruction (the minimum),
    5064              :          we remove the need for two instructions in the negation, as the
    5065              :          result is done this way.
    5066              :          When using masks, do it for SI/DImode element types, as it is shorter
    5067              :          than the two subtractions.  */
    5068        16904 :       if ((code != EQ
    5069         7202 :            && GET_MODE_SIZE (mode) != 64
    5070         7202 :            && vector_all_ones_operand (opfalse, data_mode)
    5071          552 :            && optrue == CONST0_RTX (data_mode))
    5072        23554 :           || (code == GTU
    5073         1954 :               && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
    5074              :               /* Don't do it if not using integer masks and we'd end up with
    5075              :                  the right values in the registers though.  */
    5076          658 :               && (GET_MODE_SIZE (mode) == 64
    5077          658 :                   || !vector_all_ones_operand (optrue, data_mode)
    5078          541 :                   || opfalse != CONST0_RTX (data_mode))))
    5079              :         {
    5080          669 :           rtx (*gen) (rtx, rtx, rtx) = NULL;
    5081              :           /* Pick the min expander for MODE; GEN stays null if none is enabled.  */
    5082          669 :           switch (mode)
    5083              :             {
    5084            0 :             case E_V16SImode:
    5085            0 :               gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
    5086              :               break;
    5087            0 :             case E_V8DImode:
    5088            0 :               gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
    5089            0 :               cop0 = force_reg (mode, cop0);
    5090            0 :               cop1 = force_reg (mode, cop1);
    5091            0 :               break;
    5092           24 :             case E_V32QImode:
    5093           24 :               if (TARGET_AVX2)
    5094           24 :                 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
    5095              :               break;
    5096           24 :             case E_V16HImode:
    5097           24 :               if (TARGET_AVX2)
    5098           24 :                 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
    5099              :               break;
    5100           25 :             case E_V8SImode:
    5101           25 :               if (TARGET_AVX2)
    5102           25 :                 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
    5103              :               break;
    5104           20 :             case E_V4DImode:
    5105           20 :               if (TARGET_AVX512VL)
    5106              :                 {
    5107            0 :                   gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
    5108            0 :                   cop0 = force_reg (mode, cop0);
    5109            0 :                   cop1 = force_reg (mode, cop1);
    5110              :                 }
    5111              :               break;
    5112           60 :             case E_V16QImode:
    5113           60 :               if (code == GTU && TARGET_SSE2)
    5114              :                 gen = gen_uminv16qi3;
    5115           24 :               else if (code == GT && TARGET_SSE4_1)
    5116              :                 gen = gen_sminv16qi3;
    5117              :               break;
    5118           40 :             case E_V8QImode:
    5119           40 :               if (code == GTU && TARGET_SSE2)
    5120              :                 gen = gen_uminv8qi3;
    5121           38 :               else if (code == GT && TARGET_SSE4_1)
    5122              :                 gen = gen_sminv8qi3;
    5123              :               break;
    5124           13 :             case E_V4QImode:
    5125           13 :               if (code == GTU && TARGET_SSE2)
    5126              :                 gen = gen_uminv4qi3;
    5127            2 :               else if (code == GT && TARGET_SSE4_1)
    5128              :                 gen = gen_sminv4qi3;
    5129              :               break;
    5130            8 :             case E_V2QImode:
    5131            8 :               if (code == GTU && TARGET_SSE2)
    5132              :                 gen = gen_uminv2qi3;
    5133            6 :               else if (code == GT && TARGET_SSE4_1)
    5134              :                 gen = gen_sminv2qi3;
    5135              :               break;
    5136           69 :             case E_V8HImode:
    5137           69 :               if (code == GTU && TARGET_SSE4_1)
    5138              :                 gen = gen_uminv8hi3;
    5139           59 :               else if (code == GT && TARGET_SSE2)
    5140              :                 gen = gen_sminv8hi3;
    5141              :               break;
    5142            4 :             case E_V4HImode:
    5143            4 :               if (code == GTU && TARGET_SSE4_1)
    5144              :                 gen = gen_uminv4hi3;
    5145            4 :               else if (code == GT && TARGET_SSE2)
    5146              :                 gen = gen_sminv4hi3;
    5147              :               break;
    5148           16 :             case E_V2HImode:
    5149           16 :               if (code == GTU && TARGET_SSE4_1)
    5150              :                 gen = gen_uminv2hi3;
    5151           16 :               else if (code == GT && TARGET_SSE2)
    5152              :                 gen = gen_sminv2hi3;
    5153              :               break;
    5154          239 :             case E_V4SImode:
    5155          239 :               if (TARGET_SSE4_1)
    5156           52 :                 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
    5157              :               break;
    5158          103 :             case E_V2SImode:
    5159          103 :               if (TARGET_SSE4_1)
    5160            0 :                 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
    5161              :               break;
    5162           24 :             case E_V2DImode:
    5163           24 :               if (TARGET_AVX512VL)
    5164              :                 {
    5165            0 :                   gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
    5166            0 :                   cop0 = force_reg (mode, cop0);
    5167            0 :                   cop1 = force_reg (mode, cop1);
    5168              :                 }
    5169              :               break;
    5170              :             default:
    5171              :               break;
    5172              :             }
    5173              : 
    5174            0 :           if (gen)
    5175              :             {
    5176          276 :               rtx tem = gen_reg_rtx (mode);
    5177          276 :               if (!vector_operand (cop0, mode))
    5178            0 :                 cop0 = force_reg (mode, cop0);
    5179          276 :               if (!vector_operand (cop1, mode))
    5180            0 :                 cop1 = force_reg (mode, cop1);
    5181          276 :               *negate = !*negate;
    5182          276 :               emit_insn (gen (tem, cop0, cop1));
    5183          276 :               cop1 = tem;  /* Compare COP0 for equality with min (COP0, COP1).  */
    5184          276 :               code = EQ;
    5185              :             }
    5186              :         }
    5187              : 
    5188              :       /* Unsigned parallel compare is not supported by the hardware.
    5189              :          Play some tricks to turn this into a signed comparison
    5190              :          against 0.  */
    5191        16904 :       if (code == GTU)
    5192              :         {
    5193         1111 :           cop0 = force_reg (mode, cop0);
    5194              : 
    5195         1111 :           switch (mode)
    5196              :             {
    5197          761 :             case E_V16SImode:
    5198          761 :             case E_V8DImode:
    5199          761 :             case E_V8SImode:
    5200          761 :             case E_V4DImode:
    5201          761 :             case E_V4SImode:
    5202          761 :             case E_V2SImode:
    5203          761 :             case E_V2DImode:
    5204          761 :                 {
    5205          761 :                   rtx t1, t2, mask;
    5206              : 
    5207              :                   /* Subtract (-(INT MAX) - 1) from both operands to make
    5208              :                      them signed.  */
    5209          761 :                   mask = ix86_build_signbit_mask (mode, true, false);
    5210          761 :                   t1 = gen_reg_rtx (mode);
    5211          761 :                   emit_insn (gen_sub3_insn (t1, cop0, mask));
    5212              : 
    5213          761 :                   t2 = gen_reg_rtx (mode);
    5214          761 :                   emit_insn (gen_sub3_insn (t2, cop1, mask));
    5215              : 
    5216          761 :                   cop0 = t1;
    5217          761 :                   cop1 = t2;
    5218          761 :                   code = GT;
    5219              :                 }
    5220          761 :               break;
    5221              : 
    5222          350 :             case E_V64QImode:
    5223          350 :             case E_V32HImode:
    5224          350 :             case E_V32QImode:
    5225          350 :             case E_V16HImode:
    5226          350 :             case E_V16QImode:
    5227          350 :             case E_V8QImode:
    5228          350 :             case E_V4QImode:
    5229          350 :             case E_V2QImode:
    5230          350 :             case E_V8HImode:
    5231          350 :             case E_V4HImode:
    5232          350 :             case E_V2HImode:
    5233              :               /* Perform a parallel unsigned saturating subtraction.  */
    5234          350 :               x = gen_reg_rtx (mode);
    5235          350 :               emit_insn (gen_rtx_SET
    5236              :                          (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
    5237          350 :               cop0 = x;
    5238          350 :               cop1 = CONST0_RTX (mode);
    5239          350 :               code = EQ;  /* The difference is tested for zero, hence the flip below.  */
    5240          350 :               *negate = !*negate;
    5241          350 :               break;
    5242              : 
    5243            0 :             default:
    5244            0 :               gcc_unreachable ();
    5245              :             }
    5246              :         }
    5247              :     }
    5248              :   /* An inverted comparison is compensated by exchanging the select values.  */
    5249        17155 :   if (*negate)
    5250         3600 :     std::swap (op_true, op_false);
    5251              : 
    5252        17155 :   if (CONST_VECTOR_P (cop1))
    5253          419 :     cop1 = force_reg (mode, cop1);
    5254              : 
    5255              :   /* Allow the comparison to be done in one mode, but the movcc to
    5256              :      happen in another mode.  */
    5257        17155 :   if (data_mode == mode)
    5258        17113 :     x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
    5259              :   else
    5260              :     {
    5261          126 :       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
    5262           42 :       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
    5263              :                                op_true, op_false);
    5264           42 :       if (GET_MODE (x) == mode)
    5265           24 :         x = gen_lowpart (data_mode, x);
    5266              :     }
    5267              : 
    5268              :   return x;
    5269              : }
    5270              : 
    5271              : /* Expand integer vector comparison.  OPERANDS[1] supplies the comparison
                      :    code, OPERANDS[2] and OPERANDS[3] are the values compared and
                      :    OPERANDS[0] receives the result.  Return false when
                      :    ix86_expand_int_sse_cmp produces no rtx, true otherwise.  */
    5272              : 
    5273              : bool
    5274        10393 : ix86_expand_int_vec_cmp (rtx operands[])
    5275              : {
    5276        10393 :   rtx_code code = GET_CODE (operands[1]);
    5277        10393 :   bool negate = false;
    5278        10393 :   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
    5279              :                                      operands[3], NULL, NULL, &negate);
    5280              : 
    5281        10393 :   if (!cmp)
    5282              :     return false;
    5283              : 
    5284        10393 :   if (negate)  /* Set by the helper when CMP still has to be inverted:
                      :                   XOR with all-ones, or compare against zero
                      :                   (the latter presumably relies on CMP being an
                      :                   all-zeros/all-ones element mask; confirm).  */
    5285              :     {
    5286         3630 :       if (TARGET_AVX512F && GET_MODE_SIZE (GET_MODE (cmp)) >= 16)
    5287           91 :         cmp = gen_rtx_XOR (GET_MODE (cmp), cmp, CONSTM1_RTX (GET_MODE (cmp)));
    5288              :       else
    5289              :         {
    5290         6826 :           cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
    5291         3413 :                                          CONST0_RTX (GET_MODE (cmp)),
    5292              :                                          NULL, NULL, &negate);
    5293         3413 :           gcc_assert (!negate);  /* No further inversion may be pending.  */
    5294              :         }
    5295              :     }
    5296              : 
    5297        10393 :   if (operands[0] != cmp)  /* Copy only if the result landed elsewhere.  */
    5298        10098 :     emit_move_insn (operands[0], cmp);
    5299              : 
    5300              :   return true;
    5301              : }
    5302              : 
    5303              : /* Expand a floating-point vector conditional move; a vcond operation
    5304              :    rather than a movcc operation.  OPERANDS[0] is the destination,
                      :    OPERANDS[3] supplies the comparison code, OPERANDS[4] and OPERANDS[5]
                      :    are the values compared, and OPERANDS[1] and OPERANDS[2] are handed to
                      :    ix86_expand_sse_cmp/ix86_expand_sse_movcc as the two data inputs
                      :    (presumably the true and false values; confirm against those
                      :    helpers).  Every path that returns yields true.  */
    5305              : 
    5306              : bool
    5307            0 : ix86_expand_fp_vcond (rtx operands[])
    5308              : {
    5309            0 :   enum rtx_code code = GET_CODE (operands[3]);
    5310            0 :   rtx cmp;
    5311              : 
    5312            0 :   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
    5313              :                                            &operands[4], &operands[5]);
    5314            0 :   if (code == UNKNOWN)  /* The prepare helper gave no single code: emit two
                      :                            compares and combine them.  Only LTGT and
                      :                            UNEQ are handled on this path.  */
    5315              :     {
    5316            0 :       rtx temp;
    5317            0 :       switch (GET_CODE (operands[3]))
    5318              :         {
    5319            0 :         case LTGT:  /* Built as ORDERED & NE.  */
    5320            0 :           temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
    5321              :                                       operands[5], operands[0], operands[0]);
    5322            0 :           cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
    5323              :                                      operands[5], operands[1], operands[2]);
    5324            0 :           code = AND;
    5325            0 :           break;
    5326            0 :         case UNEQ:  /* Built as UNORDERED | EQ.  */
    5327            0 :           temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
    5328              :                                       operands[5], operands[0], operands[0]);
    5329            0 :           cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
    5330              :                                      operands[5], operands[1], operands[2]);
    5331            0 :           code = IOR;
    5332            0 :           break;
    5333            0 :         default:
    5334            0 :           gcc_unreachable ();
    5335              :         }
    5336            0 :       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
    5337              :                                  OPTAB_DIRECT);
    5338            0 :       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
    5339            0 :       return true;
    5340              :     }
    5341              : 
    5342            0 :   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
    5343              :                                  operands[5], operands[1], operands[2]))
    5344              :     return true;  /* The min/max helper reported success; nothing more to do.  */
    5345              : 
    5346            0 :   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
    5347              :                              operands[1], operands[2]);
    5348            0 :   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
    5349            0 :   return true;
    5350              : }
    5351              : 
    5352              : /* Expand a signed/unsigned integral vector conditional move.
                      :    OPERANDS[0] is the destination (its mode is DATA_MODE), OPERANDS[3]
                      :    supplies the comparison code, OPERANDS[4] and OPERANDS[5] are the
                      :    values compared (MODE is the mode of OPERANDS[4]), and OPERANDS[1]
                      :    and OPERANDS[2] are the two values selected between.  Return false
                      :    when ix86_expand_int_sse_cmp produces no rtx, true otherwise.  */
    5353              : 
    5354              : bool
    5355         3349 : ix86_expand_int_vcond (rtx operands[])
    5356              : {
    5357         3349 :   machine_mode data_mode = GET_MODE (operands[0]);
    5358         3349 :   machine_mode mode = GET_MODE (operands[4]);
    5359         3349 :   enum rtx_code code = GET_CODE (operands[3]);
    5360         3349 :   bool negate = false;
    5361         3349 :   rtx x, cop0, cop1;
    5362              : 
    5363         3349 :   cop0 = operands[4];
    5364         3349 :   cop1 = operands[5];
    5365              : 
    5366              :   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
    5367              :      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
    5368         3349 :   if ((code == LT || code == GE)
    5369            0 :       && data_mode == mode
    5370            0 :       && cop1 == CONST0_RTX (mode)
    5371            0 :       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
    5372            0 :       && GET_MODE_UNIT_SIZE (data_mode) > 1
    5373            0 :       && GET_MODE_UNIT_SIZE (data_mode) <= 8
    5374         3349 :       && (GET_MODE_SIZE (data_mode) == 16
    5375            0 :           || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    5376              :     {
    5377            0 :       rtx negop = operands[2 - (code == LT)];  /* The arm not tested against
                      :                                                   zero above: OPERANDS[1] for
                      :                                                   LT, OPERANDS[2] for GE.  */
    5378            0 :       int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;  /* Top bit index.  */
    5379            0 :       if (negop == CONST1_RTX (data_mode))
    5380              :         {
    5381            0 :           rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
    5382              :                                          operands[0], 1, OPTAB_DIRECT);
    5383            0 :           if (res != operands[0])
    5384            0 :             emit_move_insn (operands[0], res);
    5385            0 :           return true;
    5386              :         }
    5387            0 :       else if (GET_MODE_INNER (data_mode) != DImode
    5388            0 :                && vector_all_ones_operand (negop, data_mode))
    5389              :         {
    5390            0 :           rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
    5391              :                                          operands[0], 0, OPTAB_DIRECT);
    5392            0 :           if (res != operands[0])
    5393            0 :             emit_move_insn (operands[0], res);
    5394            0 :           return true;
    5395              :         }
    5396              :     }
    5397              : 
    5398         3349 :   if (!nonimmediate_operand (cop1, mode))
    5399          126 :     cop1 = force_reg (mode, cop1);
    5400         3349 :   if (!general_operand (operands[1], data_mode))
    5401            0 :     operands[1] = force_reg (data_mode, operands[1]);
    5402         3349 :   if (!general_operand (operands[2], data_mode))
    5403            0 :     operands[2] = force_reg (data_mode, operands[2]);
    5404              : 
    5405         3349 :   x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
    5406              :                                operands[1], operands[2], &negate);
    5407              : 
    5408         3349 :   if (!x)
    5409              :     return false;
    5410              : 
    5411         3349 :   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
    5412         3349 :                          operands[2-negate]);  /* NEGATE swaps the two arms.  */
    5413         3349 :   return true;
    5414              : }
    5415              : 
    5416              : static bool
    5417       123160 : ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
    5418              :                               struct expand_vec_perm_d *d)
    5419              : {
    5420              :   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
    5421              :      expander, so args are either in d, or in op0, op1 etc.
                      :      Select the vpermt2var generator for MODE if the enabled ISA flags
                      :      allow it.  Return false when no generator applies; return true
                      :      without emitting anything when D is set and D->testing_p; otherwise
                      :      emit the insn and return true.  */
    5422       123160 :   machine_mode mode = GET_MODE (d ? d->op0 : op0);
    5423       123160 :   machine_mode maskmode = mode;
    5424       123160 :   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
    5425              : 
    5426       123160 :   switch (mode)
    5427              :     {
    5428        23432 :     case E_V16QImode:
    5429        23432 :       if (TARGET_AVX512VL && TARGET_AVX512VBMI)
    5430              :         gen = gen_avx512vl_vpermt2varv16qi3;
    5431              :       break;
    5432          521 :     case E_V32QImode:
    5433          521 :       if (TARGET_AVX512VL && TARGET_AVX512VBMI)
    5434              :         gen = gen_avx512vl_vpermt2varv32qi3;
    5435              :       break;
    5436          198 :     case E_V64QImode:
    5437          198 :       if (TARGET_AVX512VBMI)
    5438              :         gen = gen_avx512bw_vpermt2varv64qi3;
    5439              :       break;
    5440        13191 :     case E_V8HImode:
    5441        13191 :       if (TARGET_AVX512VL && TARGET_AVX512BW)
    5442              :         gen = gen_avx512vl_vpermt2varv8hi3;
    5443              :       break;
    5444          775 :     case E_V16HImode:
    5445          775 :       if (TARGET_AVX512VL && TARGET_AVX512BW)
    5446              :         gen = gen_avx512vl_vpermt2varv16hi3;
    5447              :       break;
    5448          331 :     case E_V32HImode:
    5449          331 :       if (TARGET_AVX512BW)
    5450              :         gen = gen_avx512bw_vpermt2varv32hi3;
    5451              :       break;
    5452        33346 :     case E_V4SImode:
    5453        33346 :       if (TARGET_AVX512VL)
    5454              :         gen = gen_avx512vl_vpermt2varv4si3;
    5455              :       break;
    5456         1169 :     case E_V8SImode:
    5457         1169 :       if (TARGET_AVX512VL)
    5458              :         gen = gen_avx512vl_vpermt2varv8si3;
    5459              :       break;
    5460          126 :     case E_V16SImode:
    5461          126 :       if (TARGET_AVX512F)
    5462              :         gen = gen_avx512f_vpermt2varv16si3;
    5463              :       break;
    5464        10333 :     case E_V4SFmode:
    5465        10333 :       if (TARGET_AVX512VL)
    5466              :         {
    5467              :           gen = gen_avx512vl_vpermt2varv4sf3;
    5468              :           maskmode = V4SImode;  /* FP vectors take an integer mask mode with
                      :                                    the same element count and width.  */
    5469              :         }
    5470              :       break;
    5471         6063 :     case E_V8SFmode:
    5472         6063 :       if (TARGET_AVX512VL)
    5473              :         {
    5474              :           gen = gen_avx512vl_vpermt2varv8sf3;
    5475              :           maskmode = V8SImode;
    5476              :         }
    5477              :       break;
    5478          239 :     case E_V16SFmode:
    5479          239 :       if (TARGET_AVX512F)
    5480              :         {
    5481              :           gen = gen_avx512f_vpermt2varv16sf3;
    5482              :           maskmode = V16SImode;
    5483              :         }
    5484              :       break;
    5485            2 :     case E_V2DImode:
    5486            2 :       if (TARGET_AVX512VL)
    5487              :         gen = gen_avx512vl_vpermt2varv2di3;
    5488              :       break;
    5489          292 :     case E_V4DImode:
    5490          292 :       if (TARGET_AVX512VL)
    5491              :         gen = gen_avx512vl_vpermt2varv4di3;
    5492              :       break;
    5493           10 :     case E_V8DImode:
    5494           10 :       if (TARGET_AVX512F)
    5495              :         gen = gen_avx512f_vpermt2varv8di3;
    5496              :       break;
    5497            2 :     case E_V2DFmode:
    5498            2 :       if (TARGET_AVX512VL)
    5499              :         {
    5500              :           gen = gen_avx512vl_vpermt2varv2df3;
    5501              :           maskmode = V2DImode;
    5502              :         }
    5503              :       break;
    5504         1848 :     case E_V4DFmode:
    5505         1848 :       if (TARGET_AVX512VL)
    5506              :         {
    5507              :           gen = gen_avx512vl_vpermt2varv4df3;
    5508              :           maskmode = V4DImode;
    5509              :         }
    5510              :       break;
    5511          186 :     case E_V8DFmode:
    5512          186 :       if (TARGET_AVX512F)
    5513              :         {
    5514              :           gen = gen_avx512f_vpermt2varv8df3;
    5515              :           maskmode = V8DImode;
    5516              :         }
    5517              :       break;
    5518              :     default:
    5519              :       break;
    5520              :     }
    5521              : 
    5522              :   if (gen == NULL)  /* Unsupported mode, or the needed ISA is not enabled.  */
    5523              :     return false;
    5524              : 
    5525          909 :   if (d && d->testing_p)  /* Feasibility query only: emit nothing.  */
    5526              :     return true;
    5527              : 
    5528              :   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
    5529              :      expander, so args are either in d, or in op0, op1 etc.  */
    5530          898 :   if (d)
    5531              :     {
    5532          898 :       rtx vec[64];  /* Assumes d->nelt <= 64 -- TODO confirm against callers.  */
    5533          898 :       target = d->target;
    5534          898 :       op0 = d->op0;
    5535          898 :       op1 = d->op1;
    5536        15622 :       for (int i = 0; i < d->nelt; ++i)  /* Turn d->perm into a constant mask.  */
    5537        14724 :         vec[i] = GEN_INT (d->perm[i]);
    5538          898 :       mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    5539              :     }
    5540              : 
    5541          906 :   emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
    5542          906 :   return true;
    5543              : }
    5544              : 
    5545              : /* Expand a variable vector permutation.  */
    5546              : 
    5547              : void
    5548           18 : ix86_expand_vec_perm (rtx operands[])
    5549              : {
    5550           18 :   rtx target = operands[0];
    5551           18 :   rtx op0 = operands[1];
    5552           18 :   rtx op1 = operands[2];
    5553           18 :   rtx mask = operands[3];
    5554           18 :   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
    5555           18 :   machine_mode mode = GET_MODE (op0);
    5556           18 :   machine_mode maskmode = GET_MODE (mask);
    5557           18 :   int w, e, i;
    5558           18 :   bool one_operand_shuffle = rtx_equal_p (op0, op1);
    5559              : 
    5560              :   /* Number of elements in the vector.  */
    5561           18 :   w = GET_MODE_NUNITS (mode);
    5562           18 :   e = GET_MODE_UNIT_SIZE (mode);
    5563           18 :   gcc_assert (w <= 64);
    5564              : 
    5565              :   /* For HF mode vector, convert it to HI using subreg.  */
    5566           36 :   if (GET_MODE_INNER (mode) == HFmode)
    5567              :     {
    5568            6 :       machine_mode orig_mode = mode;
    5569            6 :       mode = mode_for_vector (HImode, w).require ();
    5570            6 :       target = lowpart_subreg (mode, target, orig_mode);
    5571            6 :       op0 = lowpart_subreg (mode, op0, orig_mode);
    5572            6 :       op1 = lowpart_subreg (mode, op1, orig_mode);
    5573              :     }
    5574              : 
    5575           18 :   if (TARGET_AVX512F && one_operand_shuffle)
    5576              :     {
    5577            5 :       rtx (*gen) (rtx, rtx, rtx) = NULL;
    5578            5 :       switch (mode)
    5579              :         {
    5580              :         case E_V16SImode:
    5581              :           gen = gen_avx512f_permvarv16si;
    5582              :           break;
    5583            0 :         case E_V16SFmode:
    5584            0 :           gen = gen_avx512f_permvarv16sf;
    5585            0 :           break;
    5586            0 :         case E_V8DImode:
    5587            0 :           gen = gen_avx512f_permvarv8di;
    5588            0 :           break;
    5589            0 :         case E_V8DFmode:
    5590            0 :           gen = gen_avx512f_permvarv8df;
    5591            0 :           break;
    5592              :         default:
    5593              :           break;
    5594              :         }
    5595            0 :       if (gen != NULL)
    5596              :         {
    5597            0 :           emit_insn (gen (target, op0, mask));
    5598           16 :           return;
    5599              :         }
    5600              :     }
    5601              : 
    5602           18 :   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    5603              :     return;
    5604              : 
    5605           10 :   if (TARGET_AVX2)
    5606              :     {
    5607            5 :       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
    5608              :         {
    5609              :           /* Unfortunately, the VPERMQ and VPERMPD instructions only support
    5610              :              a constant shuffle operand.  With a tiny bit of effort we can
    5611              :              use VPERMD instead.  A re-interpretation stall for V4DFmode is
    5612              :              unfortunate but there's no avoiding it.
    5613              :              Similarly for V16HImode we don't have instructions for variable
    5614              :              shuffling, while for V32QImode we can use after preparing suitable
    5615              :              masks vpshufb; vpshufb; vpermq; vpor.  */
    5616              : 
    5617              :           if (mode == V16HImode)
    5618              :             {
    5619              :               maskmode = mode = V32QImode;
    5620              :               w = 32;
    5621              :               e = 1;
    5622              :             }
    5623              :           else
    5624              :             {
    5625              :               maskmode = mode = V8SImode;
    5626              :               w = 8;
    5627              :               e = 4;
    5628              :             }
    5629            0 :           t1 = gen_reg_rtx (maskmode);
    5630              : 
    5631              :           /* Replicate the low bits of the V4DImode mask into V8SImode:
    5632              :                mask = { A B C D }
    5633              :                t1 = { A A B B C C D D }.  */
    5634            0 :           for (i = 0; i < w / 2; ++i)
    5635            0 :             vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
    5636            0 :           vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
    5637            0 :           vt = force_reg (maskmode, vt);
    5638            0 :           mask = gen_lowpart (maskmode, mask);
    5639            0 :           if (maskmode == V8SImode)
    5640            0 :             emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
    5641              :           else
    5642            0 :             emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
    5643              : 
    5644              :           /* Multiply the shuffle indices by two.  */
    5645            0 :           t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
    5646              :                                     OPTAB_DIRECT);
    5647              : 
    5648              :           /* Add one to the odd shuffle indices:
    5649              :                 t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
    5650            0 :           for (i = 0; i < w / 2; ++i)
    5651              :             {
    5652            0 :               vec[i * 2] = const0_rtx;
    5653            0 :               vec[i * 2 + 1] = const1_rtx;
    5654              :             }
    5655            0 :           vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
    5656            0 :           vt = validize_mem (force_const_mem (maskmode, vt));
    5657            0 :           t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
    5658              :                                     OPTAB_DIRECT);
    5659              : 
    5660              :           /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
    5661            0 :           operands[3] = mask = t1;
    5662            0 :           target = gen_reg_rtx (mode);
    5663            0 :           op0 = gen_lowpart (mode, op0);
    5664            0 :           op1 = gen_lowpart (mode, op1);
    5665              :         }
    5666              : 
    5667            5 :       switch (mode)
    5668              :         {
    5669            1 :         case E_V8SImode:
    5670              :           /* The VPERMD and VPERMPS instructions already properly ignore
    5671              :              the high bits of the shuffle elements.  No need for us to
    5672              :              perform an AND ourselves.  */
    5673            1 :           if (one_operand_shuffle)
    5674              :             {
    5675            0 :               emit_insn (gen_avx2_permvarv8si (target, op0, mask));
    5676            0 :               if (target != operands[0])
    5677            0 :                 emit_move_insn (operands[0],
    5678            0 :                                 gen_lowpart (GET_MODE (operands[0]), target));
    5679              :             }
    5680              :           else
    5681              :             {
    5682            1 :               t1 = gen_reg_rtx (V8SImode);
    5683            1 :               t2 = gen_reg_rtx (V8SImode);
    5684            1 :               emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
    5685            1 :               emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
    5686            1 :               goto merge_two;
    5687              :             }
    5688            0 :           return;
    5689              : 
    5690            0 :         case E_V8SFmode:
    5691            0 :           mask = gen_lowpart (V8SImode, mask);
    5692            0 :           if (one_operand_shuffle)
    5693            0 :             emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
    5694              :           else
    5695              :             {
    5696            0 :               t1 = gen_reg_rtx (V8SFmode);
    5697            0 :               t2 = gen_reg_rtx (V8SFmode);
    5698            0 :               emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
    5699            0 :               emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
    5700            0 :               goto merge_two;
    5701              :             }
    5702            0 :           return;
    5703              : 
    5704            1 :         case E_V4SImode:
    5705            1 :           if (one_operand_shuffle)
    5706              :             break; /* Handled below for TARGET_AVX.  */
    5707              :           /* By combining the two 128-bit input vectors into one 256-bit
    5708              :              input vector, we can use VPERMD and VPERMPS for the full
    5709              :              two-operand shuffle.  */
    5710            0 :           t1 = gen_reg_rtx (V8SImode);
    5711            0 :           t2 = gen_reg_rtx (V8SImode);
    5712            0 :           emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
    5713            0 :           emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
    5714            0 :           emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
    5715            0 :           emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
    5716            0 :           return;
    5717              : 
    5718            1 :         case E_V4SFmode:
    5719            1 :           if (one_operand_shuffle)
    5720              :             break; /* Handled below for TARGET_AVX.  */
    5721            0 :           t1 = gen_reg_rtx (V8SFmode);
    5722            0 :           t2 = gen_reg_rtx (V8SImode);
    5723            0 :           mask = gen_lowpart (V4SImode, mask);
    5724            0 :           emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
    5725            0 :           emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
    5726            0 :           emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
    5727            0 :           emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
    5728            0 :           return;
    5729              : 
    5730            0 :         case E_V32QImode:
    5731            0 :           t1 = gen_reg_rtx (V32QImode);
    5732            0 :           t2 = gen_reg_rtx (V32QImode);
    5733            0 :           t3 = gen_reg_rtx (V32QImode);
    5734            0 :           vt2 = GEN_INT (-128);
    5735            0 :           vt = gen_const_vec_duplicate (V32QImode, vt2);
    5736            0 :           vt = force_reg (V32QImode, vt);
    5737            0 :           for (i = 0; i < 32; i++)
    5738            0 :             vec[i] = i < 16 ? vt2 : const0_rtx;
    5739            0 :           vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
    5740            0 :           vt2 = force_reg (V32QImode, vt2);
    5741              :           /* From mask create two adjusted masks, which contain the same
    5742              :              bits as mask in the low 7 bits of each vector element.
    5743              :              The first mask will have the most significant bit clear
    5744              :              if it requests element from the same 128-bit lane
    5745              :              and MSB set if it requests element from the other 128-bit lane.
    5746              :              The second mask will have the opposite values of the MSB,
    5747              :              and additionally will have its 128-bit lanes swapped.
    5748              :              E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
    5749              :              t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
    5750              :              t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
    5751              :              stands for other 12 bytes.  */
    5752              :           /* The bit whether element is from the same lane or the other
    5753              :              lane is bit 4, so shift it up by 3 to the MSB position.  */
    5754            0 :           t5 = gen_reg_rtx (V4DImode);
    5755            0 :           emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
    5756              :                                     GEN_INT (3)));
    5757              :           /* Clear MSB bits from the mask just in case it had them set.  */
    5758            0 :           emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
    5759              :           /* After this t1 will have MSB set for elements from other lane.  */
    5760            0 :           emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
    5761              :           /* Clear bits other than MSB.  */
    5762            0 :           emit_insn (gen_andv32qi3 (t1, t1, vt));
    5763              :           /* Or in the lower bits from mask into t3.  */
    5764            0 :           emit_insn (gen_iorv32qi3 (t3, t1, t2));
    5765              :           /* And invert MSB bits in t1, so MSB is set for elements from the same
    5766              :              lane.  */
    5767            0 :           emit_insn (gen_xorv32qi3 (t1, t1, vt));
    5768              :           /* Swap 128-bit lanes in t3.  */
    5769            0 :           t6 = gen_reg_rtx (V4DImode);
    5770            0 :           emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
    5771              :                                           const2_rtx, GEN_INT (3),
    5772              :                                           const0_rtx, const1_rtx));
    5773              :           /* And or in the lower bits from mask into t1.  */
    5774            0 :           emit_insn (gen_iorv32qi3 (t1, t1, t2));
    5775            0 :           if (one_operand_shuffle)
    5776              :             {
    5777              :               /* Each of these shuffles will put 0s in places where
    5778              :                  element from the other 128-bit lane is needed, otherwise
    5779              :                  will shuffle in the requested value.  */
    5780            0 :               emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
    5781            0 :                                                 gen_lowpart (V32QImode, t6)));
    5782            0 :               emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
    5783              :               /* For t3 the 128-bit lanes are swapped again.  */
    5784            0 :               t7 = gen_reg_rtx (V4DImode);
    5785            0 :               emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
    5786              :                                               const2_rtx, GEN_INT (3),
    5787              :                                               const0_rtx, const1_rtx));
    5788              :               /* And oring both together leads to the result.  */
    5789            0 :               emit_insn (gen_iorv32qi3 (target, t1,
    5790            0 :                                         gen_lowpart (V32QImode, t7)));
    5791            0 :               if (target != operands[0])
    5792            0 :                 emit_move_insn (operands[0],
    5793            0 :                                 gen_lowpart (GET_MODE (operands[0]), target));
    5794            0 :               return;
    5795              :             }
    5796              : 
    5797            0 :           t4 = gen_reg_rtx (V32QImode);
    5798              :           /* Similarly to the above one_operand_shuffle code,
    5799              :              just repeated twice, once for each operand.  merge_two:
    5800              :              code will merge the two results together.  */
    5801            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
    5802            0 :                                             gen_lowpart (V32QImode, t6)));
    5803            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
    5804            0 :                                             gen_lowpart (V32QImode, t6)));
    5805            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
    5806            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
    5807            0 :           t7 = gen_reg_rtx (V4DImode);
    5808            0 :           emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
    5809              :                                           const2_rtx, GEN_INT (3),
    5810              :                                           const0_rtx, const1_rtx));
    5811            0 :           t8 = gen_reg_rtx (V4DImode);
    5812            0 :           emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
    5813              :                                           const2_rtx, GEN_INT (3),
    5814              :                                           const0_rtx, const1_rtx));
    5815            0 :           emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
    5816            0 :           emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
    5817            0 :           t1 = t4;
    5818            0 :           t2 = t3;
    5819            0 :           goto merge_two;
    5820              : 
    5821            2 :         default:
    5822            4 :           gcc_assert (GET_MODE_SIZE (mode) <= 16);
    5823              :           break;
    5824              :         }
    5825              :     }
    5826              : 
    5827            9 :   if (TARGET_AVX && one_operand_shuffle)
    5828            8 :     switch (mode)
    5829              :       {
    5830            2 :       case V4SImode:
    5831            2 :         op0 = gen_lowpart (V4SFmode, op0);
    5832            2 :         t1 = gen_reg_rtx (V4SFmode);
    5833            2 :         emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask));
    5834            2 :         emit_move_insn (target, gen_lowpart (mode, t1));
    5835            2 :         return;
    5836            2 :       case V4SFmode:
    5837            2 :         emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask));
    5838            2 :         return;
    5839            2 :       case V2DImode:
    5840            2 :         op0 = gen_lowpart (V2DFmode, op0);
    5841            2 :         t1 = gen_reg_rtx (V2DImode);
    5842            2 :         t2 = gen_reg_rtx (V2DFmode);
    5843            2 :         emit_insn (gen_addv2di3 (t1, mask, mask));
    5844            2 :         emit_insn (gen_avx_vpermilvarv2df3 (t2, op0, t1));
    5845            2 :         emit_move_insn (target, gen_lowpart (mode, t2));
    5846            2 :         return;
    5847            2 :       case V2DFmode:
    5848            2 :         t1 = gen_reg_rtx (V2DImode);
    5849            2 :         emit_insn (gen_addv2di3 (t1, mask, mask));
    5850            2 :         emit_insn (gen_avx_vpermilvarv2df3 (target, op0, t1));
    5851            2 :         return;
    5852              :       default:
    5853              :         break;
    5854              :       }
    5855              : 
    5856            1 :   if (TARGET_XOP)
    5857              :     {
    5858              :       /* The XOP VPPERM insn supports three inputs.  By ignoring the
    5859              :          one_operand_shuffle special case, we avoid creating another
    5860              :          set of constant vectors in memory.  */
    5861            0 :       one_operand_shuffle = false;
    5862              : 
    5863              :       /* mask = mask & {2*w-1, ...} */
    5864            0 :       vt = GEN_INT (2*w - 1);
    5865              :     }
    5866              :   else
    5867              :     {
    5868              :       /* mask = mask & {w-1, ...} */
    5869            1 :       vt = GEN_INT (w - 1);
    5870              :     }
    5871              : 
    5872            1 :   vt = gen_const_vec_duplicate (maskmode, vt);
    5873            1 :   mask = expand_simple_binop (maskmode, AND, mask, vt,
    5874              :                               NULL_RTX, 0, OPTAB_DIRECT);
    5875              : 
    5876              :   /* For non-QImode operations, convert the word permutation control
    5877              :      into a byte permutation control.  */
    5878            1 :   if (mode != V16QImode)
    5879              :     {
    5880            1 :       mask = expand_simple_binop (maskmode, ASHIFT, mask,
    5881            2 :                                   GEN_INT (exact_log2 (e)),
    5882              :                                   NULL_RTX, 0, OPTAB_DIRECT);
    5883              : 
    5884              :       /* Convert mask to vector of chars.  */
    5885            1 :       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
    5886              : 
    5887              :       /* Replicate each of the input bytes into byte positions:
    5888              :          (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
    5889              :          (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
    5890              :          (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
    5891           18 :       for (i = 0; i < 16; ++i)
    5892           16 :         vec[i] = GEN_INT (i/e * e);
    5893            1 :       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
    5894            1 :       vt = validize_mem (force_const_mem (V16QImode, vt));
    5895            1 :       if (TARGET_XOP)
    5896            0 :         emit_insn (gen_xop_pperm (mask, mask, mask, vt));
    5897              :       else
    5898            1 :         emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
    5899              : 
    5900              :       /* Convert it into the byte positions by doing
    5901              :          mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
    5902           17 :       for (i = 0; i < 16; ++i)
    5903           16 :         vec[i] = GEN_INT (i % e);
    5904            1 :       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
    5905            1 :       vt = validize_mem (force_const_mem (V16QImode, vt));
    5906            1 :       emit_insn (gen_addv16qi3 (mask, mask, vt));
    5907              :     }
    5908              : 
    5909              :   /* The actual shuffle operations all operate on V16QImode.  */
    5910            1 :   op0 = gen_lowpart (V16QImode, op0);
    5911            1 :   op1 = gen_lowpart (V16QImode, op1);
    5912              : 
    5913            1 :   if (TARGET_XOP)
    5914              :     {
    5915            0 :       if (GET_MODE (target) != V16QImode)
    5916            0 :         target = gen_reg_rtx (V16QImode);
    5917            0 :       emit_insn (gen_xop_pperm (target, op0, op1, mask));
    5918            0 :       if (target != operands[0])
    5919            0 :         emit_move_insn (operands[0],
    5920            0 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5921              :     }
    5922            1 :   else if (one_operand_shuffle)
    5923              :     {
    5924            1 :       if (GET_MODE (target) != V16QImode)
    5925            1 :         target = gen_reg_rtx (V16QImode);
    5926            1 :       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
    5927            1 :       if (target != operands[0])
    5928            1 :         emit_move_insn (operands[0],
    5929            1 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5930              :     }
    5931              :   else
    5932              :     {
    5933            0 :       rtx xops[6];
    5934            0 :       bool ok;
    5935              : 
    5936              :       /* Shuffle the two input vectors independently.  */
    5937            0 :       t1 = gen_reg_rtx (V16QImode);
    5938            0 :       t2 = gen_reg_rtx (V16QImode);
    5939            0 :       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
    5940            0 :       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
    5941              : 
    5942            1 :  merge_two:
    5943              :       /* Then merge them together.  The key is whether any given control
    5944              :          element contained a bit set that indicates the second word.  */
    5945            1 :       mask = operands[3];
    5946            1 :       vt = GEN_INT (w);
    5947            1 :       if (maskmode == V2DImode && !TARGET_SSE4_1)
    5948              :         {
    5949              :           /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
    5950              :              more shuffle to convert the V2DI input mask into a V4SI
    5951              :              input mask.  At which point the masking that expand_int_vcond
    5952              :              will work as desired.  */
    5953            0 :           rtx t3 = gen_reg_rtx (V4SImode);
    5954            0 :           emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
    5955              :                                         const0_rtx, const0_rtx,
    5956              :                                         const2_rtx, const2_rtx));
    5957            0 :           mask = t3;
    5958            0 :           maskmode = V4SImode;
    5959            0 :           e = w = 4;
    5960              :         }
    5961              : 
    5962            1 :       vt = gen_const_vec_duplicate (maskmode, vt);
    5963            1 :       vt = force_reg (maskmode, vt);
    5964            1 :       mask = expand_simple_binop (maskmode, AND, mask, vt,
    5965              :                                   NULL_RTX, 0, OPTAB_DIRECT);
    5966              : 
    5967            1 :       if (GET_MODE (target) != mode)
    5968            0 :         target = gen_reg_rtx (mode);
    5969            1 :       xops[0] = target;
    5970            1 :       xops[1] = gen_lowpart (mode, t2);
    5971            1 :       xops[2] = gen_lowpart (mode, t1);
    5972            1 :       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
    5973            1 :       xops[4] = mask;
    5974            1 :       xops[5] = vt;
    5975            1 :       ok = ix86_expand_int_vcond (xops);
    5976            1 :       gcc_assert (ok);
    5977            1 :       if (target != operands[0])
    5978            0 :         emit_move_insn (operands[0],
    5979            0 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5980              :     }
    5981              : }
    5982              : 
    5983              : /* Extend SRC into next wider integer vector type.  UNSIGNED_P is
    5984              :    true if we should do zero extension, else sign extension.
                      :
                      :    SRC must have one of the modes V8QI, V4QI, V2QI, V4HI, V2HI or
                      :    V2SI; any other mode reaches gcc_unreachable.  The function
                      :    builds a three-element operand array (DEST, SRC forced into a
                      :    register, and a helper vector) and hands it to
                      :    ix86_split_mmx_punpck with its second argument false.
                      :    NOTE(review): that call presumably emits the low-half
                      :    interleave that performs the extension; confirm against
                      :    ix86_split_mmx_punpck.  */
    5985              : 
    5986              : void
    5987          354 : ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
    5988              : {
    5989          354 :   machine_mode imode = GET_MODE (src);
    5990          354 :   rtx ops[3];
    5991              :   /* Only the source modes listed below are supported.  */
    5992          354 :   switch (imode)
    5993              :     {
    5994          354 :     case E_V8QImode:
    5995          354 :     case E_V4QImode:
    5996          354 :     case E_V2QImode:
    5997          354 :     case E_V4HImode:
    5998          354 :     case E_V2HImode:
    5999          354 :     case E_V2SImode:
    6000          354 :       break;
    6001            0 :     default:
    6002            0 :       gcc_unreachable ();
    6003              :     }
    6004              : 
    6005          354 :   ops[0] = dest;
    6006              : 
    6007          354 :   ops[1] = force_reg (imode, src);
    6008              :   /* The helper operand is a zero vector for zero extension.  For sign
                      :      extension it is the result of ix86_expand_sse_cmp called with
                      :      GT, a zero vector and SRC, computed into a fresh register.
                      :      NOTE(review): presumably an all-ones mask for the negative
                      :      elements of SRC; confirm against ix86_expand_sse_cmp.  */
    6009          354 :   if (unsigned_p)
    6010           97 :     ops[2] = force_reg (imode, CONST0_RTX (imode));
    6011              :   else
    6012          257 :     ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
    6013              :                                   ops[1], pc_rtx, pc_rtx);
    6014              : 
    6015          354 :   ix86_split_mmx_punpck (ops, false);
    6016          354 : }
    6017              : 
    6018              : /* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
    6019              :    true if we should do zero extension, else sign extension.  HIGH_P is
    6020              :    true if we want the N/2 high elements, else the low elements.
                      :
                      :    With TARGET_SSE4_1 a zero/sign extension pattern is emitted on
                      :    a vector TMP derived from SRC: for modes of 32 bytes or more
                      :    TMP is the half extracted by a vec_extract_{hi,lo} pattern; for
                      :    smaller modes TMP is SRC itself, or, when HIGH_P, SRC shifted
                      :    right as a single integer by half its width.
                      :
                      :    Without SSE4.1, SRC is interleaved (high or low according to
                      :    HIGH_P) with either a zero vector or the result of
                      :    ix86_expand_sse_cmp (GT, zero, SRC), and the interleaved value
                      :    is moved into DEST reinterpreted in DEST's mode.
                      :
                      :    A mode of SRC that is not handled by the relevant switch
                      :    reaches gcc_unreachable.  */
    6021              : 
    6022              : void
    6023        18736 : ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
    6024              : {
    6025        18736 :   machine_mode imode = GET_MODE (src);
    6026        18736 :   rtx tmp;
    6027              : 
    6028        18736 :   if (TARGET_SSE4_1)
    6029              :     {
    6030         6466 :       rtx (*unpack)(rtx, rtx);
    6031         6466 :       rtx (*extract)(rtx, rtx) = NULL;
    6032         6466 :       machine_mode halfmode = BLKmode;
    6033              :       /* Select the extension pattern for IMODE.  The first six cases
                      :          additionally record the half-width mode and the pattern
                      :          that extracts the requested half; the remaining cases
                      :          leave EXTRACT and HALFMODE at their initial values.  */
    6034         6466 :       switch (imode)
    6035              :         {
    6036          198 :         case E_V64QImode:
    6037          198 :           if (unsigned_p)
    6038              :             unpack = gen_avx512bw_zero_extendv32qiv32hi2;
    6039              :           else
    6040           64 :             unpack = gen_avx512bw_sign_extendv32qiv32hi2;
    6041          198 :           halfmode = V32QImode;
    6042          198 :           extract
    6043          198 :             = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
    6044              :           break;
    6045          711 :         case E_V32QImode:
    6046          711 :           if (unsigned_p)
    6047              :             unpack = gen_avx2_zero_extendv16qiv16hi2;
    6048              :           else
    6049          142 :             unpack = gen_avx2_sign_extendv16qiv16hi2;
    6050          711 :           halfmode = V16QImode;
    6051          711 :           extract
    6052          711 :             = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
    6053              :           break;
    6054          104 :         case E_V32HImode:
    6055          104 :           if (unsigned_p)
    6056              :             unpack = gen_avx512f_zero_extendv16hiv16si2;
    6057              :           else
    6058           64 :             unpack = gen_avx512f_sign_extendv16hiv16si2;
    6059          104 :           halfmode = V16HImode;
    6060          104 :           extract
    6061          104 :             = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
    6062              :           break;
    6063          429 :         case E_V16HImode:
    6064          429 :           if (unsigned_p)
    6065              :             unpack = gen_avx2_zero_extendv8hiv8si2;
    6066              :           else
    6067          314 :             unpack = gen_avx2_sign_extendv8hiv8si2;
    6068          429 :           halfmode = V8HImode;
    6069          429 :           extract
    6070          429 :             = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
    6071              :           break;
    6072          104 :         case E_V16SImode:
    6073          104 :           if (unsigned_p)
    6074              :             unpack = gen_avx512f_zero_extendv8siv8di2;
    6075              :           else
    6076           86 :             unpack = gen_avx512f_sign_extendv8siv8di2;
    6077          104 :           halfmode = V8SImode;
    6078          104 :           extract
    6079          104 :             = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
    6080              :           break;
    6081          382 :         case E_V8SImode:
    6082          382 :           if (unsigned_p)
    6083              :             unpack = gen_avx2_zero_extendv4siv4di2;
    6084              :           else
    6085          320 :             unpack = gen_avx2_sign_extendv4siv4di2;
    6086          382 :           halfmode = V4SImode;
    6087          382 :           extract
    6088          382 :             = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
    6089              :           break;
    6090         2597 :         case E_V16QImode:
    6091         2597 :           if (unsigned_p)
    6092              :             unpack = gen_sse4_1_zero_extendv8qiv8hi2;
    6093              :           else
    6094          270 :             unpack = gen_sse4_1_sign_extendv8qiv8hi2;
    6095              :           break;
    6096          993 :         case E_V8HImode:
    6097          993 :           if (unsigned_p)
    6098              :             unpack = gen_sse4_1_zero_extendv4hiv4si2;
    6099              :           else
    6100          776 :             unpack = gen_sse4_1_sign_extendv4hiv4si2;
    6101              :           break;
    6102          544 :         case E_V4SImode:
    6103          544 :           if (unsigned_p)
    6104              :             unpack = gen_sse4_1_zero_extendv2siv2di2;
    6105              :           else
    6106          484 :             unpack = gen_sse4_1_sign_extendv2siv2di2;
    6107              :           break;
    6108          119 :         case E_V8QImode:
    6109          119 :           if (unsigned_p)
    6110              :             unpack = gen_sse4_1_zero_extendv4qiv4hi2;
    6111              :           else
    6112           78 :             unpack = gen_sse4_1_sign_extendv4qiv4hi2;
    6113              :           break;
    6114          279 :         case E_V4HImode:
    6115          279 :           if (unsigned_p)
    6116              :             unpack = gen_sse4_1_zero_extendv2hiv2si2;
    6117              :           else
    6118          220 :             unpack = gen_sse4_1_sign_extendv2hiv2si2;
    6119              :           break;
    6120            6 :         case E_V4QImode:
    6121            6 :           if (unsigned_p)
    6122              :             unpack = gen_sse4_1_zero_extendv2qiv2hi2;
    6123              :           else
    6124            0 :             unpack = gen_sse4_1_sign_extendv2qiv2hi2;
    6125              :           break;
    6126            0 :         default:
    6127            0 :           gcc_unreachable ();
    6128              :         }
    6129              :       /* Compute TMP, the operand handed to the extension pattern:
                      :          the extracted half for modes of 32 bytes or more, SRC
                      :          shifted down by half its width when HIGH_P, else SRC.  */
    6130        12932 :       if (GET_MODE_SIZE (imode) >= 32)
    6131              :         {
    6132         1928 :           tmp = gen_reg_rtx (halfmode);
    6133         1928 :           emit_insn (extract (tmp, src));
    6134              :         }
    6135         4538 :       else if (high_p)
    6136              :         {
    6137         2352 :           switch (GET_MODE_SIZE (imode))
    6138              :             {
    6139          971 :             case 16:
    6140              :               /* Shift higher 8 bytes to lower 8 bytes.  */
    6141          971 :               tmp = gen_reg_rtx (V1TImode);
    6142          971 :               emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
    6143              :                                              GEN_INT (64)));
    6144          971 :               break;
    6145          202 :             case 8:
    6146              :               /* Shift higher 4 bytes to lower 4 bytes.  */
    6147          202 :               tmp = gen_reg_rtx (V1DImode);
    6148          202 :               emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
    6149              :                                             GEN_INT (32)));
    6150          202 :               break;
    6151            3 :             case 4:
    6152              :               /* Shift higher 2 bytes to lower 2 bytes.  */
    6153            3 :               tmp = gen_reg_rtx (V1SImode);
    6154            3 :               emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
    6155              :                                             GEN_INT (16)));
    6156            3 :               break;
    6157            0 :             default:
    6158            0 :               gcc_unreachable ();
    6159              :             }
    6160              :           /* View the shifted value in the original vector mode again.  */
    6161         1176 :           tmp = gen_lowpart (imode, tmp);
    6162              :         }
    6163              :       else
    6164              :         tmp = src;
    6165              : 
    6166         6466 :       emit_insn (unpack (dest, tmp));
    6167              :     }
    6168              :   else
    6169              :     {
    6170        12270 :       rtx (*unpack)(rtx, rtx, rtx);
    6171              :       /* Select the high or low interleave pattern for IMODE.  */
    6172        12270 :       switch (imode)
    6173              :         {
    6174         3368 :         case E_V16QImode:
    6175         3368 :           if (high_p)
    6176              :             unpack = gen_vec_interleave_highv16qi;
    6177              :           else
    6178         1687 :             unpack = gen_vec_interleave_lowv16qi;
    6179              :           break;
    6180         5142 :         case E_V8HImode:
    6181         5142 :           if (high_p)
    6182              :             unpack = gen_vec_interleave_highv8hi;
    6183              :           else
    6184         2571 :             unpack = gen_vec_interleave_lowv8hi;
    6185              :           break;
    6186         2362 :         case E_V4SImode:
    6187         2362 :           if (high_p)
    6188              :             unpack = gen_vec_interleave_highv4si;
    6189              :           else
    6190         1181 :             unpack = gen_vec_interleave_lowv4si;
    6191              :           break;
    6192          478 :         case E_V8QImode:
    6193          478 :           if (high_p)
    6194              :             unpack = gen_mmx_punpckhbw;
    6195              :           else
    6196          239 :             unpack = gen_mmx_punpcklbw;
    6197              :           break;
    6198          906 :         case E_V4HImode:
    6199          906 :           if (high_p)
    6200              :             unpack = gen_mmx_punpckhwd;
    6201              :           else
    6202          453 :             unpack = gen_mmx_punpcklwd;
    6203              :           break;
    6204           14 :         case E_V4QImode:
    6205           14 :           if (high_p)
    6206              :             unpack = gen_mmx_punpckhbw_low;
    6207              :           else
    6208            7 :             unpack = gen_mmx_punpcklbw_low;
    6209              :           break;
    6210            0 :         default:
    6211            0 :           gcc_unreachable ();
    6212              :         }
    6213              :       /* TMP is the vector interleaved with SRC: zero for zero extension,
                      :          otherwise the result of ix86_expand_sse_cmp with GT, a
                      :          zero vector and SRC.  NOTE(review): presumably an
                      :          all-ones mask for negative elements; confirm against
                      :          ix86_expand_sse_cmp.  */
    6214        12270 :       if (unsigned_p)
    6215         4882 :         tmp = force_reg (imode, CONST0_RTX (imode));
    6216              :       else
    6217         7388 :         tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
    6218              :                                    src, pc_rtx, pc_rtx);
    6219              :       /* Interleave into a fresh IMODE register, then move that register,
                      :          viewed in DEST's mode, into DEST.  */
    6220        12270 :       rtx tmp2 = gen_reg_rtx (imode);
    6221        12270 :       emit_insn (unpack (tmp2, src, tmp));
    6222        12270 :       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    6223              :     }
    6224        18736 : }
    6225              : 
    6226              : /* Return true if MEM is a constant pool reference whose constant is a
    6227              :    CONST_VECTOR, and store the vector's elements in PERM as the perm
                      :    index.  Return false, leaving PERM untouched, if the mode of MEM
                      :    is not an integral mode, if MEM is not a MEM whose address is a
                      :    constant pool SYMBOL_REF, or if the pool constant is not (and
                      :    cannot be simplified to) a CONST_VECTOR in the mode of MEM.
                      :
                      :    On success PERM receives GET_MODE_NUNITS (GET_MODE (MEM))
                      :    entries, so the caller must provide at least that many ints.
                      :    Each entry is the UINTVAL of the corresponding element.  */
    6228              : bool
    6229           35 : ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
    6230              : {
    6231           35 :   machine_mode mode = GET_MODE (mem);
    6232           35 :   int nelt = GET_MODE_NUNITS (mode);
    6233              : 
    6234           35 :   if (!INTEGRAL_MODE_P (mode))
    6235              :     return false;
    6236              : 
    6237              :     /* MEM must be a reference into the constant pool.  */
    6238           35 :   if (!(MEM_P (mem))
    6239           35 :       || !SYMBOL_REF_P (XEXP (mem, 0))
    6240           70 :       || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
    6241              :    return false;
    6242              : 
    6243           35 :   rtx constant = get_pool_constant (XEXP (mem, 0));
    6244              : 
    6245           35 :   if (!CONST_VECTOR_P (constant))
    6246              :     return false;
    6247              : 
    6248              :   /* There could be some rtx like
    6249              :      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
    6250              :      but with "*.LC1" refer to V2DI constant vector.  In that case
                      :      reinterpret the constant in the mode of MEM (subreg at byte
                      :      offset 0) and give up if that does not yield a CONST_VECTOR.  */
    6251           35 :   if (GET_MODE (constant) != mode)
    6252              :     {
    6253            0 :       constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
    6254              : 
    6255            0 :       if (constant == nullptr || !CONST_VECTOR_P (constant))
    6256              :         return false;
    6257              :     }
    6258              :   /* Copy out one index per element of the vector.  */
    6259          771 :   for (int i = 0; i != nelt; i++)
    6260          736 :     perm[i] = UINTVAL (XVECEXP (constant, 0, i));
    6261              : 
    6262              :   return true;
    6263              : }
    6264              : 
    6265              : /* Split OPERAND into word-sized parts.  Similar to split_double_mode,
    6266              :    but works for floating point operands and non-offsettable memories.
    6267              :    For pushes, it returns just stack offsets; the values will be saved
    6268              :    in the right order.  At most four parts are generated.  */
    6269              : 
    6270              : static int
    6271      4129276 : ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
    6272              : {
    6273      4129276 :   int size;
    6274              : 
    6275      4129276 :   if (!TARGET_64BIT)
    6276      1561710 :     size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
    6277              :   else
    6278      6695544 :     size = (GET_MODE_SIZE (mode) + 4) / 8;
    6279              : 
    6280      4129276 :   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
    6281      4129276 :   gcc_assert (size >= 2 && size <= 4);
    6282              : 
    6283              :   /* Optimize constant pool reference to immediates.  This is used by fp
    6284              :      moves, that force all constants to memory to allow combining.  */
    6285      4129276 :   if (MEM_P (operand) && MEM_READONLY_P (operand))
    6286        38080 :     operand = avoid_constant_pool_reference (operand);
    6287              : 
    6288      4129276 :   if (MEM_P (operand) && !offsettable_memref_p (operand))
    6289              :     {
    6290              :       /* The only non-offsetable memories we handle are pushes.  */
    6291       183996 :       int ok = push_operand (operand, VOIDmode);
    6292              : 
    6293       183996 :       gcc_assert (ok);
    6294              : 
    6295       183996 :       operand = copy_rtx (operand);
    6296       183996 :       PUT_MODE (operand, word_mode);
    6297       183996 :       parts[0] = parts[1] = parts[2] = parts[3] = operand;
    6298       183996 :       return size;
    6299              :     }
    6300              : 
    6301      3945280 :   if (CONST_VECTOR_P (operand))
    6302              :     {
    6303        41950 :       scalar_int_mode imode = int_mode_for_mode (mode).require ();
    6304              :       /* Caution: if we looked through a constant pool memory above,
    6305              :          the operand may actually have a different mode now.  That's
    6306              :          ok, since we want to pun this all the way back to an integer.  */
    6307        41950 :       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
    6308        41950 :       gcc_assert (operand != NULL);
    6309        41950 :       mode = imode;
    6310              :     }
    6311              : 
    6312      3945280 :   if (!TARGET_64BIT)
    6313              :     {
    6314       622928 :       if (mode == DImode)
    6315       493889 :         split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
    6316              :       else
    6317              :         {
    6318       129039 :           int i;
    6319              : 
    6320       129039 :           if (REG_P (operand))
    6321              :             {
    6322        66914 :               gcc_assert (reload_completed);
    6323       200742 :               for (i = 0; i < size; i++)
    6324       133828 :                 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
    6325              :             }
    6326        62125 :           else if (offsettable_memref_p (operand))
    6327              :             {
    6328        60777 :               operand = adjust_address (operand, SImode, 0);
    6329        60777 :               parts[0] = operand;
    6330       122074 :               for (i = 1; i < size; i++)
    6331        61297 :                 parts[i] = adjust_address (operand, SImode, 4 * i);
    6332              :             }
    6333         1348 :           else if (CONST_DOUBLE_P (operand))
    6334              :             {
    6335         1348 :               const REAL_VALUE_TYPE *r;
    6336         1348 :               long l[4];
    6337              : 
    6338         1348 :               r = CONST_DOUBLE_REAL_VALUE (operand);
    6339         1348 :               switch (mode)
    6340              :                 {
    6341            0 :                 case E_TFmode:
    6342            0 :                   real_to_target (l, r, mode);
    6343            0 :                   parts[3] = gen_int_mode (l[3], SImode);
    6344            0 :                   parts[2] = gen_int_mode (l[2], SImode);
    6345            0 :                   break;
    6346          198 :                 case E_XFmode:
    6347              :                   /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
    6348              :                      long double may not be 80-bit.  */
    6349          198 :                   real_to_target (l, r, mode);
    6350          198 :                   parts[2] = gen_int_mode (l[2], SImode);
    6351          198 :                   break;
    6352         1150 :                 case E_DFmode:
    6353         1150 :                   REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
    6354         1150 :                   break;
    6355            0 :                 default:
    6356            0 :                   gcc_unreachable ();
    6357              :                 }
    6358         1348 :               parts[1] = gen_int_mode (l[1], SImode);
    6359         1348 :               parts[0] = gen_int_mode (l[0], SImode);
    6360              :             }
    6361              :           else
    6362            0 :             gcc_unreachable ();
    6363              :         }
    6364              :     }
    6365              :   else
    6366              :     {
    6367      3322352 :       if (mode == TImode)
    6368      3302249 :         split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
    6369      3322352 :       if (mode == XFmode || mode == TFmode)
    6370              :         {
    6371        20103 :           machine_mode upper_mode = mode==XFmode ? SImode : DImode;
    6372        20103 :           if (REG_P (operand))
    6373              :             {
    6374         1491 :               gcc_assert (reload_completed);
    6375         1491 :               parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
    6376         1491 :               parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
    6377              :             }
    6378        18612 :           else if (offsettable_memref_p (operand))
    6379              :             {
    6380        14492 :               operand = adjust_address (operand, DImode, 0);
    6381        14492 :               parts[0] = operand;
    6382        14492 :               parts[1] = adjust_address (operand, upper_mode, 8);
    6383              :             }
    6384         4120 :           else if (CONST_DOUBLE_P (operand))
    6385              :             {
    6386         4120 :               long l[4];
    6387              : 
    6388         4120 :               real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
    6389              : 
    6390              :               /* real_to_target puts 32-bit pieces in each long.  */
    6391         8240 :               parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
    6392         4120 :                                        | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
    6393         4120 :                                           << 32), DImode);
    6394              : 
    6395         4120 :               if (upper_mode == SImode)
    6396         2940 :                 parts[1] = gen_int_mode (l[2], SImode);
    6397              :               else
    6398         1180 :                 parts[1]
    6399         1180 :                   = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
    6400         1180 :                                   | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
    6401         1180 :                                      << 32), DImode);
    6402              :             }
    6403              :           else
    6404            0 :             gcc_unreachable ();
    6405              :         }
    6406              :     }
    6407              : 
    6408              :   return size;
    6409              : }
    6410              : 
    6411              : /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
    6412              :    All required insns are emitted here; nothing is returned.  For a
    6413              :    non-push multi-part move, operands 2-5 are set to the destination
    6414              :    parts and operands 6-9 to the source parts, in emission order.  */
    6415              :
    6416              : void
    6417      2077552 : ix86_split_long_move (rtx operands[])
    6418              : {
                          /* part[0][*] holds the pieces of the destination (operands[0]) and
                             part[1][*] the pieces of the source (operands[1]); nparts is the
                             piece count returned by ix86_split_to_parts.  */
    6419      2077552 :   rtx part[2][4];
    6420      2077552 :   int nparts, i, j;
    6421      2077552 :   int push = 0;
    6422      2077552 :   int collisions = 0;
    6423      2077552 :   machine_mode mode = GET_MODE (operands[0]);
    6424      2077552 :   bool collisionparts[4];
    6425              :
    6426              :   /* The DFmode expanders may ask us to move a double.
    6427              :      For a 64-bit target this is a single move.  By hiding the fact
    6428              :      here we simplify the i386.md splitters.  */
    6429      3764352 :   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    6430              :     {
    6431              :       /* Optimize constant pool references to immediates.  This is used by
    6432              :          fp moves, which force all constants to memory to allow combining.  */
    6433              :
    6434        12914 :       if (MEM_P (operands[1])
    6435        12499 :           && SYMBOL_REF_P (XEXP (operands[1], 0))
    6436        13520 :           && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
    6437          117 :         operands[1] = get_pool_constant (XEXP (operands[1], 0));
                              /* A push destination is copied and retyped to word_mode; any
                                 other destination is accessed through its DImode lowpart.  */
    6438        12914 :       if (push_operand (operands[0], VOIDmode))
    6439              :         {
    6440        12914 :           operands[0] = copy_rtx (operands[0]);
    6441        12914 :           PUT_MODE (operands[0], word_mode);
    6442              :         }
    6443              :       else
    6444            0 :         operands[0] = gen_lowpart (DImode, operands[0]);
    6445        12914 :       operands[1] = gen_lowpart (DImode, operands[1]);
    6446        12914 :       emit_move_insn (operands[0], operands[1]);
    6447        12914 :       return;
    6448              :     }
    6449              :
    6450              :   /* The only non-offsettable memory we handle is push.  */
    6451      2064638 :   if (push_operand (operands[0], VOIDmode))
    6452              :     push = 1;
    6453              :   else
    6454      1880642 :     gcc_assert (!MEM_P (operands[0])
    6455              :                 || offsettable_memref_p (operands[0]));
    6456              :
    6457      2064638 :   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
    6458      2064638 :   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
    6459              :
    6460              :   /* When emitting a push, take care of source operands on the stack.  */
    6461       183996 :   if (push && MEM_P (operands[1])
    6462      2161802 :       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    6463              :     {
                              /* Every source part is re-addressed below through the address of
                                 the last source part.  */
    6464        56279 :       rtx src_base = XEXP (part[1][nparts - 1], 0);
    6465              :
    6466              :       /* Compensate for the stack decrement by 4.  */
    6467        56279 :       if (!TARGET_64BIT && nparts == 3
    6468        51619 :           && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
    6469            0 :         src_base = plus_constant (Pmode, src_base, 4);
    6470              :
    6471              :       /* src_base refers to the stack pointer and is
    6472              :          automatically decreased by emitted push.  */
    6473       169116 :       for (i = 0; i < nparts; i++)
    6474       112837 :         part[1][i] = change_address (part[1][i],
    6475       112837 :                                      GET_MODE (part[1][i]), src_base);
    6476              :     }
    6477              :
    6478              :   /* We need to copy in the right order in case an address register
    6479              :      of the source overlaps the destination.  */
    6480      2064638 :   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    6481              :     {
    6482              :       rtx tmp;
    6483              :
                              /* Record which destination parts are mentioned in the address of
                                 the first source part, and count them.  */
    6484      2360874 :       for (i = 0; i < nparts; i++)
    6485              :         {
    6486      1573916 :           collisionparts[i]
    6487      1573916 :             = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
    6488      1573916 :           if (collisionparts[i])
    6489        16896 :             collisions++;
    6490              :         }
    6491              :
    6492              :       /* Collision in the middle part can be handled by reordering.  */
    6493       786958 :       if (collisions == 1 && nparts == 3 && collisionparts [1])
    6494              :         {
    6495            0 :           std::swap (part[0][1], part[0][2]);
    6496            0 :           std::swap (part[1][1], part[1][2]);
    6497              :         }
    6498       786958 :       else if (collisions == 1
    6499       786958 :                && nparts == 4
    6500            0 :                && (collisionparts [1] || collisionparts [2]))
    6501              :         {
    6502            0 :           if (collisionparts [1])
    6503              :             {
    6504            0 :               std::swap (part[0][1], part[0][2]);
    6505            0 :               std::swap (part[1][1], part[1][2]);
    6506              :             }
    6507              :           else
    6508              :             {
    6509            0 :               std::swap (part[0][2], part[0][3]);
    6510            0 :               std::swap (part[1][2], part[1][3]);
    6511              :             }
    6512              :         }
    6513              :
    6514              :       /* If there are more collisions, we can't handle it by reordering.
    6515              :          Do an lea to the last part and use only one colliding move.  */
    6516       786958 :       else if (collisions > 1)
    6517              :         {
    6518           84 :           rtx base, addr;
    6519              :
    6520           84 :           collisions = 1;
    6521              :
    6522           84 :           base = part[0][nparts - 1];
    6523              :
    6524              :           /* Handle the case when the last part isn't valid for lea.
    6525              :              Happens in 64-bit mode storing the 12-byte XFmode.  */
    6526          126 :           if (GET_MODE (base) != Pmode)
    6527            0 :             base = gen_rtx_REG (Pmode, REGNO (base));
    6528              :
    6529           84 :           addr = XEXP (part[1][0], 0);
    6530           84 :           if (TARGET_TLS_DIRECT_SEG_REFS)
    6531              :             {
    6532           84 :               struct ix86_address parts;
    6533           84 :               int ok = ix86_decompose_address (addr, &parts);
    6534           84 :               gcc_assert (ok);
    6535              :               /* It is not valid to use %gs: or %fs: in lea.  */
    6536           84 :               gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
    6537              :             }
                                  /* Load the source address into BASE, then address every source
                                     part relative to BASE in UNITS_PER_WORD steps.  */
    6538           84 :           emit_insn (gen_rtx_SET (base, addr));
    6539           84 :           part[1][0] = replace_equiv_address (part[1][0], base);
    6540          168 :           for (i = 1; i < nparts; i++)
    6541              :             {
    6542          168 :               tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
    6543           84 :               part[1][i] = replace_equiv_address (part[1][i], tmp);
    6544              :             }
    6545              :         }
    6546              :     }
    6547              :
                          /* Pushes are emitted from the last part down to the first.  */
    6548      2064638 :   if (push)
    6549              :     {
    6550       183996 :       if (!TARGET_64BIT)
    6551              :         {
    6552       158576 :           if (nparts == 3)
    6553              :             {
    6554          580 :               if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
    6555            0 :                 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
    6556          580 :               emit_move_insn (part[0][2], part[1][2]);
    6557              :             }
    6558       157996 :           else if (nparts == 4)
    6559              :             {
    6560            0 :               emit_move_insn (part[0][3], part[1][3]);
    6561            0 :               emit_move_insn (part[0][2], part[1][2]);
    6562              :             }
    6563              :         }
    6564              :       else
    6565              :         {
    6566              :           /* In 64-bit mode there is no 32-bit push.  For a register this
    6567              :              is fine - we just use its larger counterpart.  Memory is
    6568              :              retyped as well - such operands come from an attempt to avoid
    6569              :              a REX prefix when moving the second half of a TFmode value.  */
    6570        25420 :           if (GET_MODE (part[1][1]) == SImode)
    6571              :             {
    6572        11245 :               switch (GET_CODE (part[1][1]))
    6573              :                 {
    6574        10811 :                 case MEM:
    6575        10811 :                   part[1][1] = adjust_address (part[1][1], DImode, 0);
    6576        10811 :                   break;
    6577              :
    6578          434 :                 case REG:
    6579          434 :                   part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
    6580          434 :                   break;
    6581              :
    6582            0 :                 default:
    6583            0 :                   gcc_unreachable ();
    6584              :                 }
    6585              :
    6586        11245 :               if (GET_MODE (part[1][0]) == SImode)
    6587            0 :                 part[1][0] = part[1][1];
    6588              :             }
    6589              :         }
    6590       183996 :       emit_move_insn (part[0][1], part[1][1]);
    6591       183996 :       emit_move_insn (part[0][0], part[1][0]);
    6592       183996 :       return;
    6593              :     }
    6594              :
    6595              :   /* Choose the correct order so the source is not overwritten before it
                             is copied.  The first branch lists the parts last to first, the
                             second first to last.  */
    6596      1880642 :   if ((REG_P (part[0][0])
    6597      1024715 :        && REG_P (part[1][1])
    6598        79878 :        && (REGNO (part[0][0]) == REGNO (part[1][1])
    6599        64716 :            || (nparts == 3
    6600            0 :                && REGNO (part[0][0]) == REGNO (part[1][2]))
    6601        64716 :            || (nparts == 4
    6602            0 :                && REGNO (part[0][0]) == REGNO (part[1][3]))))
    6603      2890195 :       || (collisions > 0
    6604        16812 :           && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    6605              :     {
    6606        95106 :       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
    6607              :         {
    6608        63404 :           operands[2 + i] = part[0][j];
    6609        63404 :           operands[6 + i] = part[1][j];
    6610              :         }
    6611              :     }
    6612              :   else
    6613              :     {
    6614      5546889 :       for (i = 0; i < nparts; i++)
    6615              :         {
    6616      3697949 :           operands[2 + i] = part[0][i];
    6617      3697949 :           operands[6 + i] = part[1][i];
    6618              :         }
    6619              :     }
    6620              :
    6621              :   /* Attempt to locally unCSE nonzero constants.  */
                          /* That is: when a nonzero CONST_INT source is moved into a register
                             destination, any later source holding the same constant is replaced
                             by that register.  */
    6622      3761353 :   for (j = 0; j < nparts - 1; j++)
    6623      1880711 :     if (CONST_INT_P (operands[6 + j])
    6624       224780 :         && operands[6 + j] != const0_rtx
    6625        62905 :         && REG_P (operands[2 + j]))
    6626       111846 :       for (i = j; i < nparts - 1; i++)
    6627        55923 :         if (CONST_INT_P (operands[7 + i])
    6628        55923 :             && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
    6629        22579 :           operands[7 + i] = operands[2 + j];
    6630              :
    6631      5641995 :   for (i = 0; i < nparts; i++)
    6632      3761353 :     emit_move_insn (operands[2 + i], operands[6 + i]);
    6633              :
    6634              :   return;
    6635              : }
    6636              : 
    6637              : /* Helper function of ix86_split_ashl.  Shift OPERAND left in place by
    6638              :    the constant COUNT, using either COUNT self-additions or one shift
    6639              :    insn (ashlsi3 when MODE is DImode, ashldi3 otherwise).  */
    6640              :
    6641              : static void
    6642         4304 : ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
    6643              : {
                          /* Use additions for a count of one, or when COUNT additions cost no
                             more than a constant shift and we are not optimizing for size.  */
    6644         4304 :   if (count == 1
    6645         4304 :       || (count * ix86_cost->add <= ix86_cost->shift_const
    6646            0 :           && !optimize_insn_for_size_p ()))
    6647              :     {
                              /* Each OPERAND += OPERAND doubles the value, i.e. shifts by one.  */
    6648           16 :       while (count-- > 0)
    6649            8 :         emit_insn (gen_add2_insn (operand, operand));
    6650              :     }
    6651              :   else
    6652              :     {
    6653         4296 :       rtx (*insn)(rtx, rtx, rtx);
    6654              :
                              /* MODE names the double-word mode being split, so DImode selects
                                 the SImode shift generator.  */
    6655         4296 :       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
    6656         4296 :       emit_insn (insn (operand, operand, GEN_INT (count)));
    6657              :     }
    6658         4304 : }
    6659              :
                        /* Split the double-word left shift OPERANDS[0] = OPERANDS[1] << OPERANDS[2]
                           in MODE into insns that work on the two halves of the value.  The half
                           mode is SImode when MODE is DImode and DImode otherwise.  SCRATCH, when
                           nonnull and TARGET_CMOVE holds, is cleared and handed to the
                           x86_shift_adj_1 pattern for a variable shift count; otherwise
                           x86_shift_adj_2 is used.  */
    6660              : void
    6661        10178 : ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
    6662              : {
    6663        10178 :   rtx (*gen_ashl3)(rtx, rtx, rtx);
    6664        10178 :   rtx (*gen_shld)(rtx, rtx, rtx);
    6665        10178 :   int half_width = GET_MODE_BITSIZE (mode) >> 1;
    6666        10178 :   machine_mode half_mode;
    6667              :
    6668        10178 :   rtx low[2], high[2];
    6669        10178 :   int count;
    6670              :
                          /* Constant shift count: choose the insn sequence right here.  */
    6671        10178 :   if (CONST_INT_P (operands[2]))
    6672              :     {
                              /* Index 0 of low/high is used as the destination below and index 1
                                 as the source; COUNT is masked with the bit size of MODE minus
                                 one.  */
    6673         8471 :       split_double_mode (mode, operands, 2, low, high);
    6674         8471 :       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
    6675              :
    6676         8471 :       if (count >= half_width)
    6677              :         {
                                  /* The source low half becomes the high half and the low half
                                     is cleared; the high half is then shifted by what remains.  */
    6678         2464 :           emit_move_insn (high[0], low[1]);
    6679         2464 :           ix86_expand_clear (low[0]);
    6680              :
    6681         2464 :           if (count > half_width)
    6682          141 :             ix86_expand_ashl_const (high[0], count - half_width, mode);
    6683              :         }
    6684         6007 :       else if (count == 1)
    6685              :         {
                                  /* Shift by one with two additions: low[0] is added to itself
                                     by the add3_cc_overflow_1 pattern and high[0] to itself by
                                     add3_carry, which is handed the CCCmode flags register (x3)
                                     and an LTU test of it (x4).
                                     NOTE(review): presumably the carry out of the low half is
                                     added into the high half - confirm against i386.md.  */
    6686         1844 :           if (!rtx_equal_p (operands[0], operands[1]))
    6687            0 :             emit_move_insn (operands[0], operands[1]);
    6688         1844 :           rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
    6689         1844 :           rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
    6690         1844 :           half_mode = mode == DImode ? SImode : DImode;
    6691         1844 :           emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
    6692              :                                              low[0], low[0]));
    6693         1844 :           emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
    6694              :                                      x3, x4));
    6695              :         }
    6696              :       else
    6697              :         {
                                  /* Any other constant count below half_width: the shld pattern
                                     on (high, low), then a plain shift of the low half.  */
    6698         4163 :           gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
    6699              :
    6700         4163 :           if (!rtx_equal_p (operands[0], operands[1]))
    6701            0 :             emit_move_insn (operands[0], operands[1]);
    6702              :
    6703         4163 :           emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
    6704         4163 :           ix86_expand_ashl_const (low[0], count, mode);
    6705              :         }
    6706         8741 :       return;
    6707              :     }
    6708              :
                          /* Variable shift count from here on; only the destination is split.  */
    6709         1707 :   split_double_mode (mode, operands, 1, low, high);
    6710         1707 :   half_mode = mode == DImode ? SImode : DImode;
    6711              :
    6712         1707 :   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
    6713              :
    6714         1707 :   if (operands[1] == const1_rtx)
    6715              :     {
    6716              :       /* Assuming we've chosen QImode capable registers, then 1 << N
    6717              :          can be done with two 32/64-bit shifts, no branches, no cmoves.  */
    6718          270 :       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
    6719              :         {
    6720          162 :           rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
    6721              :
                                  /* Clear both halves, test the count against half_width, then
                                     store the EQ result into the low byte of low[0] and the NE
                                     result into the low byte of high[0].  */
    6722          162 :           ix86_expand_clear (low[0]);
    6723          162 :           ix86_expand_clear (high[0]);
    6724          162 :           emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
    6725              :
    6726          162 :           d = gen_lowpart (QImode, low[0]);
    6727          162 :           d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
    6728          162 :           s = gen_rtx_EQ (QImode, flags, const0_rtx);
    6729          162 :           emit_insn (gen_rtx_SET (d, s));
    6730              :
    6731          162 :           d = gen_lowpart (QImode, high[0]);
    6732          162 :           d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
    6733          162 :           s = gen_rtx_NE (QImode, flags, const0_rtx);
    6734          162 :           emit_insn (gen_rtx_SET (d, s));
    6735              :         }
    6736              :
    6737              :       /* Otherwise, we can get the same results by manually performing
    6738              :          a bit extract operation on bit 5/6, and then performing the two
    6739              :          shifts.  The two methods of getting 0/1 into low/high are exactly
    6740              :          the same size.  Avoiding the shift in the bit extract case helps
    6741              :          pentium4 a bit; no one else seems to care much either way.  */
    6742              :       else
    6743              :         {
    6744          108 :           rtx (*gen_lshr3)(rtx, rtx, rtx);
    6745          108 :           rtx (*gen_and3)(rtx, rtx, rtx);
    6746          108 :           rtx (*gen_xor3)(rtx, rtx, rtx);
    6747          108 :           HOST_WIDE_INT bits;
    6748          108 :           rtx x;
    6749              :
                                  /* BITS is the position of the count bit that is extracted:
                                     5 for a DImode shift, 6 otherwise.  */
    6750          108 :           if (mode == DImode)
    6751              :             {
    6752              :               gen_lshr3 = gen_lshrsi3;
    6753              :               gen_and3 = gen_andsi3;
    6754              :               gen_xor3 = gen_xorsi3;
    6755              :               bits = 5;
    6756              :             }
    6757              :           else
    6758              :             {
    6759            0 :               gen_lshr3 = gen_lshrdi3;
    6760            0 :               gen_and3 = gen_anddi3;
    6761            0 :               gen_xor3 = gen_xordi3;
    6762            0 :               bits = 6;
    6763              :             }
    6764              :
    6765          108 :           if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
    6766            0 :             x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
    6767              :           else
    6768          108 :             x = gen_lowpart (half_mode, operands[2]);
    6769          108 :           emit_insn (gen_rtx_SET (high[0], x));
    6770              :
                                  /* high = (count >> BITS) & 1; low = high ^ 1.  */
    6771          108 :           emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
    6772          108 :           emit_insn (gen_and3 (high[0], high[0], const1_rtx));
    6773          108 :           emit_move_insn (low[0], high[0]);
    6774          108 :           emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
    6775              :         }
    6776              :
    6777          270 :       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
    6778          270 :       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
    6779          270 :       return;
    6780              :     }
    6781              :
    6782         1437 :   if (operands[1] == constm1_rtx)
    6783              :     {
    6784              :       /* For -1 << N, we can avoid the shld instruction, because we
    6785              :          know that we're shifting 0...31/63 ones into a -1.  */
    6786          117 :       emit_move_insn (low[0], constm1_rtx);
    6787          117 :       if (optimize_insn_for_size_p ())
    6788            6 :         emit_move_insn (high[0], low[0]);
    6789              :       else
    6790          111 :         emit_move_insn (high[0], constm1_rtx);
    6791              :     }
    6792              :   else
    6793              :     {
    6794         1320 :       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
    6795              :
    6796         1320 :       if (!rtx_equal_p (operands[0], operands[1]))
    6797            0 :         emit_move_insn (operands[0], operands[1]);
    6798              :
    6799         1320 :       split_double_mode (mode, operands, 1, low, high);
    6800         1320 :       emit_insn (gen_shld (high[0], low[0], operands[2]));
    6801              :     }
    6802              :
    6803         1437 :   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
    6804              :
                          /* NOTE(review): the x86_shift_adj_1 / x86_shift_adj_2 patterns
                             presumably fix up the halves when the count is half_width or more -
                             confirm in i386.md.  */
    6805         1437 :   if (TARGET_CMOVE && scratch)
    6806              :     {
    6807          966 :       ix86_expand_clear (scratch);
    6808          966 :       emit_insn (gen_x86_shift_adj_1
    6809              :                  (half_mode, high[0], low[0], operands[2], scratch));
    6810              :     }
    6811              :   else
    6812          471 :     emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
    6813              : }
    6814              :
                        /* Split the double-word arithmetic right shift
                           OPERANDS[0] = OPERANDS[1] >> OPERANDS[2] in MODE into insns that work
                           on the two halves of the value.  The half-word shift generator is
                           ashrsi3 when MODE is DImode and ashrdi3 otherwise.  SCRATCH, when
                           nonnull and TARGET_CMOVE holds, is used with the x86_shift_adj_1
                           pattern for a variable shift count; otherwise x86_shift_adj_3 is
                           used.  */
    6815              : void
    6816         6038 : ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
    6817              : {
    6818         4798 :   rtx (*gen_ashr3)(rtx, rtx, rtx)
    6819         6038 :     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
    6820         6038 :   rtx (*gen_shrd)(rtx, rtx, rtx);
    6821         6038 :   int half_width = GET_MODE_BITSIZE (mode) >> 1;
    6822              :
    6823         6038 :   rtx low[2], high[2];
    6824         6038 :   int count;
    6825              :
                          /* Constant shift count: choose the insn sequence right here.  */
    6826         6038 :   if (CONST_INT_P (operands[2]))
    6827              :     {
                              /* Index 0 of low/high is used as the destination below and index 1
                                 as the source; COUNT is masked with the bit size of MODE minus
                                 one.  */
    6828         5861 :       split_double_mode (mode, operands, 2, low, high);
    6829         5861 :       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
    6830              :
    6831        11722 :       if (count == GET_MODE_BITSIZE (mode) - 1)
    6832              :         {
                                  /* Shift by all bits but one: both halves become the source
                                     high half shifted arithmetically by half_width - 1.  */
    6833           85 :           emit_move_insn (high[0], high[1]);
    6834           85 :           emit_insn (gen_ashr3 (high[0], high[0],
    6835           85 :                                 GEN_INT (half_width - 1)));
    6836           85 :           emit_move_insn (low[0], high[0]);
    6837              :
    6838              :         }
    6839         5776 :       else if (count >= half_width)
    6840              :         {
                                  /* The source high half becomes the low half; the high half is
                                     that same value shifted arithmetically by half_width - 1, and
                                     the low half is then shifted by what remains of COUNT.  */
    6841         1619 :           emit_move_insn (low[0], high[1]);
    6842         1619 :           emit_move_insn (high[0], low[0]);
    6843         1619 :           emit_insn (gen_ashr3 (high[0], high[0],
    6844         1619 :                                 GEN_INT (half_width - 1)));
    6845              :
    6846         1619 :           if (count > half_width)
    6847           38 :             emit_insn (gen_ashr3 (low[0], low[0],
    6848           38 :                                   GEN_INT (count - half_width)));
    6849              :         }
    6850         4157 :       else if (count == 1
    6851          766 :                && (TARGET_USE_RCR || optimize_size > 1))
    6852              :         {
                                  /* One-bit shift: the ashr*_carry pattern on the high half
                                     followed by the rcr pattern on the low half.
                                     NOTE(review): presumably the bit shifted out of the high
                                     half travels through the carry flag into the low half -
                                     confirm against i386.md.  */
    6853            1 :           if (!rtx_equal_p (operands[0], operands[1]))
    6854            0 :             emit_move_insn (operands[0], operands[1]);
    6855            1 :           if (mode == DImode)
    6856              :             {
    6857            0 :               emit_insn (gen_ashrsi3_carry (high[0], high[0]));
    6858            0 :               emit_insn (gen_rcrsi2 (low[0], low[0]));
    6859              :             }
    6860              :           else
    6861              :             {
    6862            1 :               emit_insn (gen_ashrdi3_carry (high[0], high[0]));
    6863            1 :               emit_insn (gen_rcrdi2 (low[0], low[0]));
    6864              :             }
    6865              :         }
    6866              :       else
    6867              :         {
                                  /* Any other constant count below half_width: the shrd pattern
                                     on (low, high), then a plain shift of the high half.  */
    6868         4156 :           gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
    6869              :
    6870         4156 :           if (!rtx_equal_p (operands[0], operands[1]))
    6871            0 :             emit_move_insn (operands[0], operands[1]);
    6872              :
    6873         4156 :           emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
    6874         4156 :           emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
    6875              :         }
    6876              :     }
    6877              :   else
    6878              :     {
                              /* Variable shift count: emit the shrd/shift pair on the
                                 destination halves, then an adjustment pattern.  */
    6879          177 :       machine_mode half_mode;
    6880              :
    6881          177 :       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
    6882              :
    6883          177 :      if (!rtx_equal_p (operands[0], operands[1]))
    6884            0 :         emit_move_insn (operands[0], operands[1]);
    6885              :
    6886          177 :       split_double_mode (mode, operands, 1, low, high);
    6887          177 :       half_mode = mode == DImode ? SImode : DImode;
    6888              :
    6889          177 :       emit_insn (gen_shrd (low[0], high[0], operands[2]));
    6890          177 :       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
    6891              :
                              /* SCRATCH receives high[0] shifted arithmetically by
                                 half_width - 1 and is handed to x86_shift_adj_1.
                                 NOTE(review): the adjustment patterns presumably fix up the
                                 halves when the count is half_width or more - confirm in
                                 i386.md.  */
    6892          177 :       if (TARGET_CMOVE && scratch)
    6893              :         {
    6894          139 :           emit_move_insn (scratch, high[0]);
    6895          139 :           emit_insn (gen_ashr3 (scratch, scratch,
    6896          139 :                                 GEN_INT (half_width - 1)));
    6897          139 :           emit_insn (gen_x86_shift_adj_1
    6898              :                      (half_mode, low[0], high[0], operands[2], scratch));
    6899              :         }
    6900              :       else
    6901           38 :         emit_insn (gen_x86_shift_adj_3
    6902              :                    (half_mode, low[0], high[0], operands[2]));
    6903              :     }
    6904         6038 : }
    6905              : 
    /* Split the double-word logical right shift OPERANDS[0] = OPERANDS[1]
       >> OPERANDS[2] in MODE into insns on the two halves of the value.
       Halves are SImode when MODE is DImode and DImode otherwise.  SCRATCH
       may be null; it is only used for the variable-count fixup, and only
       when TARGET_CMOVE is set.  */
    6906              : void
    6907        13235 : ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
    6908              : {
                          /* Half-word logical shift generator matching the half mode.  */
    6909         5892 :   rtx (*gen_lshr3)(rtx, rtx, rtx)
    6910        13235 :     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
    6911        13235 :   rtx (*gen_shrd)(rtx, rtx, rtx);
    6912        13235 :   int half_width = GET_MODE_BITSIZE (mode) >> 1;
    6913              : 
    6914        13235 :   rtx low[2], high[2];
    6915        13235 :   int count;
    6916              : 
    6917        13235 :   if (CONST_INT_P (operands[2]))
    6918              :     {
                              /* Constant count: split both destination and source, and keep
                                 only the count bits below the bitsize of MODE.  */
    6919        11833 :       split_double_mode (mode, operands, 2, low, high);
    6920        11833 :       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
    6921              : 
    6922        11833 :       if (count >= half_width)
    6923              :         {
                                  /* Only the high input half contributes: it becomes the low
                                     output half, the high output half is cleared, and any
                                     count in excess of HALF_WIDTH is applied to the low half.  */
    6924         8471 :           emit_move_insn (low[0], high[1]);
    6925         8471 :           ix86_expand_clear (high[0]);
    6926              : 
    6927         8471 :           if (count > half_width)
    6928          651 :             emit_insn (gen_lshr3 (low[0], low[0],
    6929          651 :                                   GEN_INT (count - half_width)));
    6930              :         }
    6931         3362 :       else if (count == 1
    6932          678 :                && (TARGET_USE_RCR || optimize_size > 1))
    6933              :         {
                                  /* Shift by one bit, done in place on the destination: the
                                     *_carry pattern shifts the high half and the rcr pattern
                                     then updates the low half.
                                     NOTE(review): presumably the bit shifted out of the high
                                     half travels through the carry flag into the low half;
                                     confirm against the patterns in i386.md.  */
    6934            1 :           if (!rtx_equal_p (operands[0], operands[1]))
    6935            0 :             emit_move_insn (operands[0], operands[1]);
    6936            1 :           if (mode == DImode)
    6937              :             {
    6938            0 :               emit_insn (gen_lshrsi3_carry (high[0], high[0]));
    6939            0 :               emit_insn (gen_rcrsi2 (low[0], low[0]));
    6940              :             }
    6941              :           else
    6942              :             {
    6943            1 :               emit_insn (gen_lshrdi3_carry (high[0], high[0]));
    6944            1 :               emit_insn (gen_rcrdi2 (low[0], low[0]));
    6945              :             }
    6946              :         }
    6947              :       else
    6948              :         {
                                  /* 0 <= count < HALF_WIDTH: work in place on the destination,
                                     first a shrd on the low/high pair, then a plain shift of
                                     the high half.  */
    6949         3361 :           gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
    6950              : 
    6951         3361 :           if (!rtx_equal_p (operands[0], operands[1]))
    6952            0 :             emit_move_insn (operands[0], operands[1]);
    6953              : 
    6954         3361 :           emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
    6955         3361 :           emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
    6956              :         }
    6957              :     }
    6958              :   else
    6959              :     {
                              /* Variable count: copy the source into the destination if they
                                 differ, then split only operand 0 and shift it in place.  */
    6960         1402 :       machine_mode half_mode;
    6961              : 
    6962         1402 :       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
    6963              : 
    6964         1402 :       if (!rtx_equal_p (operands[0], operands[1]))
    6965            0 :         emit_move_insn (operands[0], operands[1]);
    6966              : 
    6967         1402 :       split_double_mode (mode, operands, 1, low, high);
    6968         1402 :       half_mode = mode == DImode ? SImode : DImode;
    6969              : 
    6970         1402 :       emit_insn (gen_shrd (low[0], high[0], operands[2]));
    6971         1402 :       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
    6972              : 
                              /* Follow-up adjustment insn.  With cmove and a scratch register
                                 the scratch is cleared and passed to shift_adj_1; otherwise
                                 shift_adj_2 is used.
                                 NOTE(review): presumably this corrects the halves when the
                                 run-time count is >= HALF_WIDTH; confirm in i386.md.  */
    6973         1402 :       if (TARGET_CMOVE && scratch)
    6974              :         {
    6975         1132 :           ix86_expand_clear (scratch);
    6976         1132 :           emit_insn (gen_x86_shift_adj_1
    6977              :                      (half_mode, low[0], high[0], operands[2], scratch));
    6978              :         }
    6979              :       else
    6980          270 :         emit_insn (gen_x86_shift_adj_2
    6981              :                    (half_mode, low[0], high[0], operands[2]));
    6982              :     }
    6983        13235 : }
    6984              : 
    6985              : /* Helper function to split TImode ashl under NDD.  The insns emitted
                           here read the source halves (low[1]/high[1]) and write the
                           destination halves (low[0]/high[0]) directly, so OPERANDS[1] is
                           never copied into OPERANDS[0] first.  OPERANDS[2] is the shift
                           count.  SCRATCH may be null; it is only used by the variable-count
                           fixup.  Asserts TARGET_APX_NDD.  */
    6986              : void
    6987            1 : ix86_split_ashl_ndd (rtx *operands, rtx scratch)
    6988              : {
    6989            1 :   gcc_assert (TARGET_APX_NDD);
    6990            1 :   int half_width = GET_MODE_BITSIZE (TImode) >> 1;
    6991              : 
    6992            1 :   rtx low[2], high[2];
    6993            1 :   int count;
    6994              : 
    6995            1 :   split_double_mode (TImode, operands, 2, low, high);
    6996            1 :   if (CONST_INT_P (operands[2]))
    6997              :     {
                              /* Keep only the count bits below the bitsize of TImode.  */
    6998            0 :       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
    6999              : 
    7000            0 :       if (count >= half_width)
    7001              :         {
                                  /* Only the low input half contributes.  After subtracting
                                     HALF_WIDTH, the high output half is the low input half
                                     copied (0), added to itself (1) or shifted left (other
                                     counts); the low output half is then cleared.  */
    7002            0 :           count = count - half_width;
    7003            0 :           if (count == 0)
    7004              :             {
    7005            0 :               if (!rtx_equal_p (high[0], low[1]))
    7006            0 :                 emit_move_insn (high[0], low[1]);
    7007              :             }
    7008            0 :           else if (count == 1)
    7009            0 :             emit_insn (gen_adddi3 (high[0], low[1], low[1]));
    7010              :           else
    7011            0 :             emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));
    7012              : 
    7013            0 :           ix86_expand_clear (low[0]);
    7014              :         }
    7015            0 :       else if (count == 1)
    7016              :         {
                                  /* Shift by one bit as an add of each half to itself.  X3 is
                                     the flags register in CCCmode and X4 an LTU comparison of
                                     it against zero; both are handed to the add-with-carry
                                     pattern used for the high half, emitted after the
                                     overflow-setting add of the low half.  */
    7017            0 :           rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
    7018            0 :           rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
    7019            0 :           emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
    7020              :                                              low[1], low[1]));
    7021            0 :           emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
    7022              :                                      x3, x4));
    7023              :         }
    7024              :       else
    7025              :         {
                                  /* Remaining constant counts below HALF_WIDTH: an NDD shld
                                     produces the high output half from both input halves,
                                     then the low half is shifted on its own.  */
    7026            0 :           emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
    7027              :                                           GEN_INT (count)));
    7028            0 :           emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
    7029              :         }
    7030              :     }
    7031              :   else
    7032              :     {
                              /* Variable count: the same shld/shift pair with the count
                                 operand, followed by an adjustment insn on the destination
                                 halves.  With cmove and a scratch register the scratch is
                                 cleared and passed to shift_adj_1; otherwise shift_adj_2 is
                                 used.
                                 NOTE(review): presumably the adjustment handles run-time
                                 counts >= HALF_WIDTH; confirm in i386.md.  */
    7033            1 :       emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
    7034              :                                       operands[2]));
    7035            1 :       emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
    7036            1 :       if (TARGET_CMOVE && scratch)
    7037              :         {
    7038            1 :           ix86_expand_clear (scratch);
    7039            1 :           emit_insn (gen_x86_shift_adj_1
    7040              :                      (DImode, high[0], low[0], operands[2], scratch));
    7041              :         }
    7042              :       else
    7043            0 :         emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
    7044              :     }
    7045            1 : }
    7046              : 
    7047              : /* Helper function to split TImode l/ashr under NDD.  CODE == ASHIFTRT
                           selects an arithmetic shift; any other code is expanded as a
                           logical shift.  The emitted insns read the source halves
                           (low[1]/high[1]) and write the destination halves
                           (low[0]/high[0]) directly.  SCRATCH may be null; it is only used
                           by the variable-count fixup.  Asserts TARGET_APX_NDD.  */
    7048              : void
    7049            2 : ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
    7050              : {
    7051            2 :   gcc_assert (TARGET_APX_NDD);
    7052            2 :   int half_width = GET_MODE_BITSIZE (TImode) >> 1;
    7053            2 :   bool ashr_p = code == ASHIFTRT;
                          /* DImode shift generator of the requested kind.  */
    7054            2 :   rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
    7055              :                                          : gen_lshrdi3;
    7056              : 
    7057            2 :   rtx low[2], high[2];
    7058            2 :   int count;
    7059              : 
    7060            2 :   split_double_mode (TImode, operands, 2, low, high);
    7061            2 :   if (CONST_INT_P (operands[2]))
    7062              :     {
                              /* Keep only the count bits below the bitsize of TImode.  */
    7063            0 :       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
    7064              : 
    7065            0 :       if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
    7066              :         {
                                  /* Arithmetic shift by the maximum count: the high output
                                     half is the high input half shifted right arithmetically
                                     by HALF_WIDTH - 1, and the low output half is a copy of
                                     that result.  */
    7067            0 :           emit_insn (gen_shr (high[0], high[1],
    7068              :                               GEN_INT (half_width - 1)));
    7069            0 :           emit_move_insn (low[0], high[0]);
    7070              :         }
    7071            0 :       else if (count >= half_width)
    7072              :         {
                                  /* Only the high input half contributes.  The high output
                                     half is written first (shifted by HALF_WIDTH - 1 for
                                     ashr, cleared for lshr); the low output half is then
                                     computed from high[1], either shifted by the excess over
                                     HALF_WIDTH or copied unchanged.  */
    7073            0 :           if (ashr_p)
    7074            0 :             emit_insn (gen_shr (high[0], high[1],
    7075              :                                 GEN_INT (half_width - 1)));
    7076              :           else
    7077            0 :             ix86_expand_clear (high[0]);
    7078              : 
    7079            0 :           if (count > half_width)
    7080            0 :             emit_insn (gen_shr (low[0], high[1],
    7081            0 :                                 GEN_INT (count - half_width)));
    7082              :           else
    7083            0 :             emit_move_insn (low[0], high[1]);
    7084              :         }
    7085              :       else
    7086              :         {
                                  /* Constant count below HALF_WIDTH: an NDD shrd produces the
                                     low output half from both input halves, then the high
                                     half is shifted on its own.  */
    7087            0 :           emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
    7088              :                                           GEN_INT (count)));
    7089            0 :           emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
    7090              :         }
    7091              :     }
    7092              :   else
    7093              :     {
                              /* Variable count: the same shrd/shift pair with the count
                                 operand, followed by an adjustment insn on the destination
                                 halves.  */
    7094            2 :       emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
    7095              :                                       operands[2]));
    7096            2 :       emit_insn (gen_shr (high[0], high[1], operands[2]));
    7097              : 
    7098            2 :       if (TARGET_CMOVE && scratch)
    7099              :         {
                                  /* SCRATCH is set up for shift_adj_1: for ashr it is the
                                     already shifted high half shifted again by
                                     HALF_WIDTH - 1, for lshr it is cleared.  */
    7100            2 :           if (ashr_p)
    7101              :             {
    7102            1 :               emit_move_insn (scratch, high[0]);
    7103            1 :               emit_insn (gen_shr (scratch, scratch,
    7104              :                                   GEN_INT (half_width - 1)));
    7105              :             }
    7106              :           else
    7107            1 :             ix86_expand_clear (scratch);
    7108              : 
    7109            2 :           emit_insn (gen_x86_shift_adj_1
    7110              :                      (DImode, low[0], high[0], operands[2], scratch));
    7111              :         }
                              /* Without cmove or a scratch register: shift_adj_3 for ashr,
                                 shift_adj_2 for lshr.  */
    7112            0 :       else if (ashr_p)
    7113            0 :         emit_insn (gen_x86_shift_adj_3
    7114              :                    (DImode, low[0], high[0], operands[2]));
    7115              :       else
    7116            0 :         emit_insn (gen_x86_shift_adj_2
    7117              :                    (DImode, low[0], high[0], operands[2]));
    7118              :     }
    7119            2 : }
    7120              : 
    7121              : /* Expand move of V1TI mode register X to a new TI mode register.
                           Returns the freshly allocated TImode pseudo.  With TARGET_SSE2 the
                           two DImode halves of the result are filled by separate vec_extract
                           insns from a V2DImode view of X; otherwise a single move of the
                           TImode lowpart of X is emitted.  */
    7122              : static rtx
    7123           17 : ix86_expand_v1ti_to_ti (rtx x)
    7124              : {
    7125           17 :   rtx result = gen_reg_rtx (TImode);
    7126           17 :   if (TARGET_SSE2)
    7127              :     {
    7128           17 :       rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
                              /* Element 0 goes to the low half, element 1 to the high half.  */
    7129           17 :       rtx lo = gen_lowpart (DImode, result);
    7130           17 :       emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
    7131           17 :       rtx hi = gen_highpart (DImode, result);
    7132           17 :       emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
    7133              :     }
    7134              :   else
    7135            0 :     emit_move_insn (result, gen_lowpart (TImode, x));
    7136           17 :   return result;
    7137              : }
    7138              : 
    7139              : /* Expand move of TI mode register X to a new V1TI mode register.
                           Returns a V1TImode register.  With TARGET_SSE2 the DImode low and
                           high parts of X are combined with a vec_concat into a new V2DImode
                           pseudo, which is then viewed as V1TImode; otherwise the V1TImode
                           lowpart of X is forced into a register directly.  */
    7140              : static rtx
    7141           17 : ix86_expand_ti_to_v1ti (rtx x)
    7142              : {
    7143           17 :   if (TARGET_SSE2)
    7144              :     {
    7145           17 :       rtx lo = gen_lowpart (DImode, x);
    7146           17 :       rtx hi = gen_highpart (DImode, x);
    7147           17 :       rtx tmp = gen_reg_rtx (V2DImode);
    7148           17 :       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
    7149           17 :       return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
    7150              :     }
    7151              : 
    7152            0 :   return force_reg (V1TImode, gen_lowpart (V1TImode, x));
    7153              : }
    7154              : 
    7155              : /* Expand V1TI mode shift (of rtx_code CODE) of OPERANDS[1] by
                           OPERANDS[2] into OPERANDS[0].  CODE == ASHIFT selects a left
                           shift; any other code is expanded as a logical right shift.  A
                           constant count is expanded with vector insns; a non-constant count
                           is handled by converting to TImode, shifting there and converting
                           back.  */
    7156              : void
    7157           42 : ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
    7158              : {
    7159           42 :   rtx op1 = force_reg (V1TImode, operands[1]);
    7160              : 
    7161           42 :   if (!CONST_INT_P (operands[2]))
    7162              :     {
                              /* Variable count: V1TI -> TI, TImode shift, TI -> V1TI.  */
    7163            6 :       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
    7164            6 :       rtx tmp2 = gen_reg_rtx (TImode);
    7165            3 :       rtx (*shift) (rtx, rtx, rtx)
    7166            6 :             = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
    7167            6 :       emit_insn (shift (tmp2, tmp1, operands[2]));
    7168            6 :       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
    7169            6 :       emit_move_insn (operands[0], tmp3);
    7170            6 :       return;
    7171              :     }
    7172              : 
                          /* Only the low seven bits of a constant count are used.  */
    7173           36 :   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
    7174              : 
                          /* A zero count is a plain copy.  */
    7175           36 :   if (bits == 0)
    7176              :     {
    7177            0 :       emit_move_insn (operands[0], op1);
    7178            0 :       return;
    7179              :     }
    7180              : 
                          /* A count that is a multiple of 8 needs a single V1TImode shift
                             insn.  */
    7181           36 :   if ((bits & 7) == 0)
    7182              :     {
    7183            0 :       rtx tmp = gen_reg_rtx (V1TImode);
    7184            0 :       if (code == ASHIFT)
    7185            0 :         emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
    7186              :       else
    7187            0 :         emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
    7188            0 :       emit_move_insn (operands[0], tmp);
    7189            0 :       return;
    7190              :     }
    7191              : 
                          /* From here on BITS is not a multiple of 8, so in particular it is
                             not 64.  Start from the input shifted by exactly 64 bits.  */
    7192           36 :   rtx tmp1 = gen_reg_rtx (V1TImode);
    7193           36 :   if (code == ASHIFT)
    7194           18 :     emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
    7195              :   else
    7196           18 :     emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
    7197              : 
    7198              :   /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
    7199           36 :   rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7200              : 
    7201              :   /* tmp3 will be the V2DImode result.  */
    7202           36 :   rtx tmp3 = gen_reg_rtx (V2DImode);
    7203              : 
    7204           36 :   if (bits > 64)
    7205              :     {
                              /* More than 64 bits: shift the 64-bit-shifted value by the
                                 remaining BITS - 64 with a V2DImode shift.  */
    7206           18 :       if (code == ASHIFT)
    7207            9 :         emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    7208              :       else
    7209            9 :         emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    7210              :     }
    7211              :   else
    7212              :     {
                              /* Fewer than 64 bits: OR together the input shifted by BITS
                                 (tmp5) and the 64-bit-shifted value shifted by 64 - BITS in
                                 the opposite direction (tmp6), both as V2DImode shifts.  */
    7213              :       /* tmp4 is operands[1], in V2DImode.  */
    7214           18 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7215              : 
    7216           18 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7217           18 :       if (code == ASHIFT)
    7218            9 :         emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7219              :       else
    7220            9 :         emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7221              : 
    7222           18 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7223           18 :       if (code == ASHIFT)
    7224            9 :         emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
    7225              :       else
    7226            9 :         emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
    7227              : 
    7228           18 :       emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    7229              :     }
    7230              : 
    7231              :   /* Convert the result back to V1TImode and store in operands[0].  */
    7232           36 :   rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7233           36 :   emit_move_insn (operands[0], tmp7);
    7234              : }
    7235              : 
    7236              : /* Expand V1TI mode rotate (of rtx_code CODE) of OPERANDS[1] by
                           OPERANDS[2] into OPERANDS[0].  For a non-constant count, CODE ==
                           ROTATE uses the TImode rotl pattern and any other code the rotr
                           pattern.  For a constant count, CODE == ROTATERT is first rewritten
                           as a left rotate by 128 minus the count, and everything below
                           expands a left rotate.  */
    7237              : void
    7238           39 : ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
    7239              : {
    7240           39 :   rtx op1 = force_reg (V1TImode, operands[1]);
    7241              : 
    7242           39 :   if (!CONST_INT_P (operands[2]))
    7243              :     {
                              /* Variable count: V1TI -> TI, TImode rotate, TI -> V1TI.  */
    7244            8 :       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
    7245            8 :       rtx tmp2 = gen_reg_rtx (TImode);
    7246            4 :       rtx (*rotate) (rtx, rtx, rtx)
    7247            8 :             = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
    7248            8 :       emit_insn (rotate (tmp2, tmp1, operands[2]));
    7249            8 :       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
    7250            8 :       emit_move_insn (operands[0], tmp3);
    7251            8 :       return;
    7252              :     }
    7253              : 
                          /* Only the low seven bits of a constant count are used.  */
    7254           31 :   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
    7255              : 
                          /* A zero count is a plain copy.  */
    7256           31 :   if (bits == 0)
    7257              :     {
    7258            0 :       emit_move_insn (operands[0], op1);
    7259            0 :       return;
    7260              :     }
    7261              : 
                          /* Turn a right rotate into the equivalent left rotate; BITS stays
                             in the range 1..127.  */
    7262           31 :   if (code == ROTATERT)
    7263           16 :     bits = 128 - bits;
    7264              : 
                          /* A multiple of 32 (so 32, 64 or 96 here) is a single pshufd on
                             the V4SImode view, with immediate 0x93, 0x4e or 0x39
                             respectively.  */
    7265           31 :   if ((bits & 31) == 0)
    7266              :     {
    7267            5 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7268            5 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7269            5 :       if (bits == 32)
    7270            1 :         emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
    7271            4 :       else if (bits == 64)
    7272            2 :         emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
    7273              :       else
    7274            2 :         emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
    7275            5 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
    7276            5 :       return;
    7277              :     }
    7278              : 
                          /* Any other multiple of 8: OR of a V1TImode left shift by BITS and
                             a V1TImode right shift by 128 - BITS.  */
    7279           26 :   if ((bits & 7) == 0)
    7280              :     {
    7281            6 :       rtx tmp1 = gen_reg_rtx (V1TImode);
    7282            6 :       rtx tmp2 = gen_reg_rtx (V1TImode);
    7283            6 :       rtx tmp3 = gen_reg_rtx (V1TImode);
    7284              : 
    7285            6 :       emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
    7286            6 :       emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
    7287            6 :       emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
    7288            6 :       emit_move_insn (operands[0], tmp3);
    7289            6 :       return;
    7290              :     }
    7291              : 
                          /* General case, BITS & 31 is nonzero.  The result is built below
                             as (LOBITS << (BITS & 31)) | (HIBITS >> (32 - (BITS & 31))) with
                             V4SImode shifts.  LOBITS and HIBITS are the input or pshufd
                             permutations of it, selected by BITS >> 5 using the same
                             immediates as the multiple-of-32 case above.  */
    7292           20 :   rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7293              : 
    7294           20 :   rtx lobits;
    7295           20 :   rtx hibits;
    7296              : 
    7297           20 :   switch (bits >> 5)
    7298              :     {
    7299            7 :     case 0:
    7300            7 :       lobits = op1_v4si;
    7301            7 :       hibits = gen_reg_rtx (V4SImode);
    7302            7 :       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
    7303            7 :       break;
    7304              : 
    7305            2 :     case 1:
    7306            2 :       lobits = gen_reg_rtx (V4SImode);
    7307            2 :       hibits = gen_reg_rtx (V4SImode);
    7308            2 :       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
    7309            2 :       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
    7310            2 :       break;
    7311              : 
    7312            2 :     case 2:
    7313            2 :       lobits = gen_reg_rtx (V4SImode);
    7314            2 :       hibits = gen_reg_rtx (V4SImode);
    7315            2 :       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
    7316            2 :       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
    7317            2 :       break;
    7318              : 
    7319            9 :     default:
    7320            9 :       lobits = gen_reg_rtx (V4SImode);
    7321            9 :       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
    7322            9 :       hibits = op1_v4si;
    7323            9 :       break;
    7324              :     }
    7325              : 
    7326           20 :   rtx tmp1 = gen_reg_rtx (V4SImode);
    7327           20 :   rtx tmp2 = gen_reg_rtx (V4SImode);
    7328           20 :   rtx tmp3 = gen_reg_rtx (V4SImode);
    7329              : 
    7330           20 :   emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
    7331           20 :   emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
    7332           20 :   emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
    7333              : 
    7334           20 :   emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
    7335              : }
    7336              : 
    7337              : /* Expand V1TI mode ashiftrt by constant.  */
    7338              : void
    7339          109 : ix86_expand_v1ti_ashiftrt (rtx operands[])
    7340              : {
    7341          109 :   rtx op1 = force_reg (V1TImode, operands[1]);
    7342              : 
    7343          109 :   if (!CONST_INT_P (operands[2]))
    7344              :     {
    7345            3 :       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
    7346            3 :       rtx tmp2 = gen_reg_rtx (TImode);
    7347            3 :       emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
    7348            3 :       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
    7349            3 :       emit_move_insn (operands[0], tmp3);
    7350            3 :       return;
    7351              :     }
    7352              : 
    7353          106 :   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
    7354              : 
    7355          106 :   if (bits == 0)
    7356              :     {
    7357            0 :       emit_move_insn (operands[0], op1);
    7358            0 :       return;
    7359              :     }
    7360              : 
    7361          106 :   if (bits == 127)
    7362              :     {
    7363              :       /* Two operations.  */
    7364            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7365            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7366            3 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7367              : 
    7368            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7369            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7370              : 
    7371            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
    7372            3 :       return;
    7373              :     }
    7374              : 
    7375          103 :   if (bits == 64)
    7376              :     {
    7377              :       /* Three operations.  */
    7378            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7379            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7380            3 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7381              : 
    7382            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7383            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7384              : 
    7385            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7386            3 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7387            3 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7388            3 :       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
    7389              : 
    7390            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7391            3 :       return;
    7392              :     }
    7393              : 
    7394          100 :   if (bits == 96)
    7395              :     {
    7396              :       /* Three operations.  */
    7397            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7398            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7399            3 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
    7400              : 
    7401            3 :       rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7402            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
    7403            3 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7404            3 :       emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
    7405              : 
    7406            3 :       rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
    7407            3 :       rtx tmp7 = gen_reg_rtx (V4SImode);
    7408            3 :       emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
    7409              : 
    7410            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
    7411            3 :       return;
    7412              :     }
    7413              : 
    7414           97 :   if (bits >= 111)
    7415              :     {
    7416              :       /* Three operations.  */
    7417           21 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7418           21 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7419           21 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
    7420              : 
    7421           21 :       rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7422           21 :       rtx tmp4 = gen_reg_rtx (V8HImode);
    7423           21 :       emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
    7424              : 
    7425           21 :       rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
    7426           21 :       rtx tmp6 = gen_reg_rtx (V4SImode);
    7427           21 :       emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
    7428              : 
    7429           21 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7430           21 :       return;
    7431              :     }
    7432              : 
    7433           76 :   if (TARGET_AVX2 || TARGET_SSE4_1)
    7434              :     {
    7435              :       /* Three operations.  */
    7436           50 :       if (bits == 32)
    7437              :         {
    7438            2 :           rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7439            2 :           rtx tmp2 = gen_reg_rtx (V4SImode);
    7440            2 :           emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
    7441              : 
    7442            2 :           rtx tmp3 = gen_reg_rtx (V1TImode);
    7443            2 :           emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
    7444              : 
    7445            2 :           if (TARGET_AVX2)
    7446              :             {
    7447            1 :               rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
    7448            1 :               rtx tmp5 = gen_reg_rtx (V4SImode);
    7449            1 :               emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
    7450              :                                                GEN_INT (7)));
    7451              : 
    7452            1 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
    7453              :             }
    7454              :           else
    7455              :             {
    7456            1 :               rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7457            1 :               rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7458            1 :               rtx tmp6 = gen_reg_rtx (V8HImode);
    7459            1 :               emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
    7460              :                                              GEN_INT (0x3f)));
    7461              : 
    7462            1 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7463              :             }
    7464            2 :           return;
    7465              :         }
    7466              : 
    7467              :       /* Three operations.  */
    7468           48 :       if (bits == 8 || bits == 16 || bits == 24)
    7469              :         {
    7470            6 :           rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7471            6 :           rtx tmp2 = gen_reg_rtx (V4SImode);
    7472            6 :           emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7473              : 
    7474            6 :           rtx tmp3 = gen_reg_rtx (V1TImode);
    7475            6 :           emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
    7476              : 
    7477            6 :           if (TARGET_AVX2)
    7478              :             {
    7479            3 :               rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
    7480            3 :               rtx tmp5 = gen_reg_rtx (V4SImode);
    7481            3 :               emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
    7482              :                                                GEN_INT (7)));
    7483              : 
    7484            3 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
    7485              :             }
    7486              :           else
    7487              :             {
    7488            3 :               rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7489            3 :               rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7490            3 :               rtx tmp6 = gen_reg_rtx (V8HImode);
    7491            3 :               emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
    7492              :                                              GEN_INT (0x3f)));
    7493              : 
    7494            3 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7495              :             }
    7496            6 :           return;
    7497              :         }
    7498              :     }
    7499              : 
    7500           68 :   if (bits > 96)
    7501              :     {
    7502              :       /* Four operations.  */
    7503            3 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7504            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7505            3 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
    7506              : 
    7507            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7508            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
    7509              : 
    7510            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
    7511            3 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7512            3 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7513            3 :       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
    7514              : 
    7515            3 :       rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
    7516            3 :       rtx tmp8 = gen_reg_rtx (V4SImode);
    7517            3 :       emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
    7518              : 
    7519            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
    7520            3 :       return;
    7521              :     }
    7522              : 
    7523           65 :   if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    7524              :     {
    7525              :       /* Four operations.  */
    7526            4 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7527            4 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7528            4 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7529              : 
    7530            4 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7531            4 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7532              : 
    7533            4 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7534            4 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
    7535              : 
    7536            4 :       rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7537            4 :       rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
    7538            4 :       rtx tmp7 = gen_reg_rtx (V8HImode);
    7539            6 :       emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
    7540              :                                      GEN_INT (bits == 48 ? 0x1f : 0x07)));
    7541              : 
    7542            4 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
    7543            4 :       return;
    7544              :     }
    7545              : 
    7546           61 :   if ((bits & 7) == 0)
    7547              :     {
    7548              :       /* Five operations.  */
    7549            9 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7550            9 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7551            9 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7552              : 
    7553            9 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7554            9 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7555              : 
    7556            9 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7557            9 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
    7558              : 
    7559            9 :       rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7560            9 :       rtx tmp6 = gen_reg_rtx (V1TImode);
    7561            9 :       emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
    7562              : 
    7563            9 :       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7564            9 :       rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
    7565            9 :       rtx tmp9 = gen_reg_rtx (V2DImode);
    7566            9 :       emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
    7567              : 
    7568            9 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
    7569            9 :       return;
    7570              :     }
    7571              : 
    7572           52 :   if (TARGET_AVX2 && bits < 32)
    7573              :     {
    7574              :       /* Six operations.  */
    7575            9 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7576            9 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7577            9 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7578              : 
    7579            9 :       rtx tmp3 = gen_reg_rtx (V1TImode);
    7580            9 :       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
    7581              : 
    7582            9 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7583            9 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7584            9 :       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7585              : 
    7586            9 :       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7587            9 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7588            9 :       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
    7589              : 
    7590            9 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7591            9 :       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
    7592              : 
    7593            9 :       rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
    7594            9 :       rtx tmp10 = gen_reg_rtx (V4SImode);
    7595            9 :       emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
    7596              : 
    7597            9 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
    7598            9 :       return;
    7599              :     }
    7600              : 
    7601           43 :   if (TARGET_SSE4_1 && bits < 15)
    7602              :     {
    7603              :       /* Six operations.  */
    7604            4 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7605            4 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7606            4 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7607              : 
    7608            4 :       rtx tmp3 = gen_reg_rtx (V1TImode);
    7609            4 :       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
    7610              : 
    7611            4 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7612            4 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7613            4 :       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7614              : 
    7615            4 :       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7616            4 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7617            4 :       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
    7618              : 
    7619            4 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7620            4 :       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
    7621              : 
    7622            4 :       rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7623            4 :       rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
    7624            4 :       rtx tmp11 = gen_reg_rtx (V8HImode);
    7625            4 :       emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
    7626              : 
    7627            4 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
    7628            4 :       return;
    7629              :     }
    7630              : 
    7631           18 :   if (bits == 1)
    7632              :     {
    7633              :       /* Eight operations.  */
    7634            1 :       rtx tmp1 = gen_reg_rtx (V1TImode);
    7635            1 :       emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
    7636              : 
    7637            1 :       rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7638            1 :       rtx tmp3 = gen_reg_rtx (V2DImode);
    7639            1 :       emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
    7640              : 
    7641            1 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7642            1 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7643            1 :       emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
    7644              : 
    7645            1 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7646            1 :       emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
    7647              : 
    7648            1 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7649            1 :       emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
    7650              : 
    7651            1 :       rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
    7652            1 :       rtx tmp9 = gen_reg_rtx (V4SImode);
    7653            1 :       emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
    7654              : 
    7655            1 :       rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
    7656            1 :       rtx tmp11 = gen_reg_rtx (V2DImode);
    7657            1 :       emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
    7658              : 
    7659            1 :       rtx tmp12 = gen_reg_rtx (V2DImode);
    7660            1 :       emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
    7661              : 
    7662            1 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
    7663            1 :       return;
    7664              :     }
    7665              : 
    7666           38 :   if (bits > 64)
    7667              :     {
    7668              :       /* Eight operations.  */
    7669           12 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7670           12 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7671           12 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7672              : 
    7673           12 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7674           12 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7675              : 
    7676           12 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7677           12 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
    7678              : 
    7679           12 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7680           12 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7681           12 :       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
    7682              : 
    7683           12 :       rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7684           12 :       rtx tmp8 = gen_reg_rtx (V1TImode);
    7685           12 :       emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
    7686              : 
    7687           12 :       rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7688           12 :       rtx tmp10 = gen_reg_rtx (V2DImode);
    7689           12 :       emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
    7690              : 
    7691           12 :       rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
    7692           12 :       rtx tmp12 = gen_reg_rtx (V2DImode);
    7693           12 :       emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
    7694              : 
    7695           12 :       rtx tmp13 = gen_reg_rtx (V2DImode);
    7696           12 :       emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
    7697              : 
    7698           12 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
    7699              :     }
    7700              :   else
    7701              :     {
    7702              :       /* Nine operations.  */
    7703           26 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7704           26 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7705           26 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7706              : 
    7707           26 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7708           26 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7709              : 
    7710           26 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7711           26 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
    7712              : 
    7713           26 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7714           26 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7715           26 :       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
    7716              : 
    7717           26 :       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7718           26 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7719           26 :       emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
    7720              : 
    7721           26 :       rtx tmp9 = gen_reg_rtx (V2DImode);
    7722           26 :       emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
    7723              : 
    7724           26 :       rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7725           26 :       rtx tmp11 = gen_reg_rtx (V1TImode);
    7726           26 :       emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
    7727              : 
    7728           26 :       rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
    7729           26 :       rtx tmp13 = gen_reg_rtx (V2DImode);
    7730           26 :       emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
    7731              : 
    7732           26 :       rtx tmp14 = gen_reg_rtx (V2DImode);
    7733           26 :       emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
    7734              : 
    7735           26 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
    7736              :     }
    7737              : }
    7738              : 
    7739              : /* Expand V2DI mode ashiftrt.  OPERANDS[0] is the destination,
                           OPERANDS[1] the value to shift and OPERANDS[2] the shift count.

                           The strategy depends on the count and on the enabled ISA:
                           - a zero count is a plain move;
                           - with SSE4.2, a constant count >= 63 becomes a single
                             gen_sse4_2_gtv2di3 (unless optimizing for size);
                           - any other constant count when XOP is unavailable, and any
                             remaining constant count >= 63, is built from V4SImode shifts
                             merged by a constant permutation;
                           - a non-constant count without XOP is a logical shift IORed with
                             a left-shifted zero-or-all-ones value;
                           - everything else uses gen_xop_shav2di3 with a negated count.

                           Note that the XOP path overwrites OPERANDS[2] when it is a
                           CONST_INT.  */
    7740              : void
    7741          404 : ix86_expand_v2di_ashiftrt (rtx operands[])
    7742              : {
                          /* A shift by zero is just a copy.  */
    7743          404 :   if (operands[2] == const0_rtx)
    7744              :     {
    7745            0 :       emit_move_insn (operands[0], operands[1]);
    7746            0 :       return;
    7747              :     }
    7748              : 
                          /* Constant counts of 63 or more are handled with one
                             gen_sse4_2_gtv2di3 whose first input is zero.  */
    7749          404 :   if (TARGET_SSE4_2
    7750          133 :       && CONST_INT_P (operands[2])
    7751          133 :       && UINTVAL (operands[2]) >= 63
    7752          412 :       && !optimize_insn_for_size_p ())
    7753              :     {
    7754            8 :       rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
    7755            8 :       emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
    7756            8 :       return;
    7757              :     }
    7758              : 
    7759          396 :   if (CONST_INT_P (operands[2])
    7760          376 :       && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
    7761              :     {
                              /* Work on the value viewed as four 32-bit elements.  SEL picks
                                 each result element from ARG0 (indices 0-3) or ARG1
                                 (indices 4-7).  */
    7762          280 :       vec_perm_builder sel (4, 4, 1);
    7763          280 :       sel.quick_grow (4);
    7764          280 :       rtx arg0, arg1;
    7765          280 :       rtx op1 = lowpart_subreg (V4SImode,
    7766              :                                 force_reg (V2DImode, operands[1]),
    7767              :                                 V2DImode);
    7768          280 :       rtx target = gen_reg_rtx (V4SImode);
    7769          280 :       if (UINTVAL (operands[2]) >= 63)
    7770              :         {
                                  /* Shift every 32-bit element right by 31, then copy
                                     element 1 into result elements 0-1 and element 3 into
                                     result elements 2-3.  */
    7771           99 :           arg0 = arg1 = gen_reg_rtx (V4SImode);
    7772           99 :           emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
    7773           99 :           sel[0] = 1;
    7774           99 :           sel[1] = 1;
    7775           99 :           sel[2] = 3;
    7776           99 :           sel[3] = 3;
    7777              :         }
    7778          181 :       else if (INTVAL (operands[2]) > 32)
    7779              :         {
                                  /* Result elements 0 and 2 are elements 1 and 3 of ARG0
                                     (shifted by COUNT - 32); result elements 1 and 3 are
                                     elements 1 and 3 of ARG1 (shifted by 31).  */
    7780           18 :           arg0 = gen_reg_rtx (V4SImode);
    7781           18 :           arg1 = gen_reg_rtx (V4SImode);
    7782           18 :           emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
    7783           18 :           emit_insn (gen_ashrv4si3 (arg0, op1,
    7784           18 :                                     GEN_INT (INTVAL (operands[2]) - 32)));
    7785           18 :           sel[0] = 1;
    7786           18 :           sel[1] = 5;
    7787           18 :           sel[2] = 3;
    7788           18 :           sel[3] = 7;
    7789              :         }
    7790          163 :       else if (INTVAL (operands[2]) == 32)
    7791              :         {
                                  /* Same selection as for COUNT > 32, but ARG0 is the
                                     unshifted input.  */
    7792            5 :           arg0 = op1;
    7793            5 :           arg1 = gen_reg_rtx (V4SImode);
    7794            5 :           emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
    7795            5 :           sel[0] = 1;
    7796            5 :           sel[1] = 5;
    7797            5 :           sel[2] = 3;
    7798            5 :           sel[3] = 7;
    7799              :         }
    7800              :       else
    7801              :         {
                                  /* COUNT < 32: result elements 0 and 2 come from the V2DImode
                                     logical shift (ARG0), result elements 1 and 3 from the
                                     V4SImode arithmetic shift (ARG1).  */
    7802          158 :           arg0 = gen_reg_rtx (V2DImode);
    7803          158 :           arg1 = gen_reg_rtx (V4SImode);
    7804          158 :           emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
    7805          158 :           emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
    7806          158 :           arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
    7807          158 :           sel[0] = 0;
    7808          158 :           sel[1] = 5;
    7809          158 :           sel[2] = 2;
    7810          158 :           sel[3] = 7;
    7811              :         }
                              /* A single permutation input is declared when ARG0 and ARG1 are
                                 the same rtx.  */
    7812          379 :       vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
    7813          280 :       rtx op0 = operands[0];
    7814          280 :       bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
    7815              :                                                   target, arg0, arg1,
    7816              :                                                   indices);
    7817          280 :       gcc_assert (ok);
    7818          280 :       emit_move_insn (op0, lowpart_subreg (V2DImode, target, V4SImode));
    7819          280 :       return;
    7820          280 :     }
    7821          116 :   if (!TARGET_XOP)
    7822              :     {
                              /* Non-constant count without XOP: the result is
                                 (OPERANDS[1] logically shifted right by COUNT)
                                 | (ZERO_OR_ALL_ONES shifted left by 64 - COUNT).  */
    7823           20 :       rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
    7824           20 :       rtx zero_or_all_ones;
    7825           20 :       if (TARGET_SSE4_2)
    7826              :         {
    7827            0 :           zero_or_all_ones = gen_reg_rtx (V2DImode);
    7828            0 :           emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
    7829              :                                          operands[1]));
    7830              :         }
    7831              :       else
    7832              :         {
                                  /* Without SSE4.2, shift each 32-bit element right by 31 and
                                     duplicate elements 1 and 3 over elements 0-1 and 2-3.  */
    7833           20 :           rtx temp = gen_reg_rtx (V4SImode);
    7834           20 :           emit_insn (gen_ashrv4si3 (temp,
    7835              :                                     lowpart_subreg (V4SImode,
    7836              :                                                     force_reg (V2DImode,
    7837              :                                                                operands[1]),
    7838              :                                                     V2DImode),
    7839              :                                     GEN_INT (31)));
    7840           20 :           zero_or_all_ones = gen_reg_rtx (V4SImode);
    7841           20 :           emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
    7842              :                                         const1_rtx, const1_rtx,
    7843              :                                         GEN_INT (3), GEN_INT (3)));
    7844           20 :           zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
    7845              :                                              V4SImode);
    7846              :         }
    7847           20 :       rtx lshr_res = gen_reg_rtx (V2DImode);
    7848           20 :       emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
    7849           20 :       rtx ashl_res = gen_reg_rtx (V2DImode);
    7850           20 :       rtx amount;
    7851           20 :       if (TARGET_64BIT)
    7852              :         {
    7853           20 :           amount = gen_reg_rtx (DImode);
    7854           20 :           emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
    7855              :                                  operands[2]));
    7856              :         }
    7857              :       else
    7858              :         {
                                  /* 32-bit target: compute 64 - COUNT in SImode and put it in
                                     element 0 of an otherwise zero V4SImode register.
                                     NOTE(review): the SImode lowpart assumes OPERANDS[2] is
                                     DImode here -- confirm against the callers.  */
    7859            0 :           rtx temp = gen_reg_rtx (SImode);
    7860            0 :           emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
    7861              :                                  lowpart_subreg (SImode, operands[2],
    7862              :                                                  DImode)));
    7863            0 :           amount = gen_reg_rtx (V4SImode);
    7864            0 :           emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
    7865              :                                         temp));
    7866              :         }
    7867           20 :       amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
    7868           20 :       emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
    7869           20 :       emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
    7870           20 :       return;
    7871              :     }
    7872              : 
                          /* XOP: gen_xop_shav2di3 takes a vector of per-element counts.  The
                             count is negated, either right here for a CONST_INT or with
                             gen_negv2di2 after it has been broadcast into REG.
                             NOTE(review): presumably a negative count selects a right shift
                             for this pattern -- confirm against its definition.  */
    7873           96 :   rtx reg = gen_reg_rtx (V2DImode);
    7874           96 :   rtx par;
    7875           96 :   bool negate = false;
    7876           96 :   int i;
    7877              : 
    7878           96 :   if (CONST_INT_P (operands[2]))
    7879           96 :     operands[2] = GEN_INT (-INTVAL (operands[2]));
    7880              :   else
    7881              :     negate = true;
    7882              : 
    7883           96 :   par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
    7884          288 :   for (i = 0; i < 2; i++)
    7885          192 :     XVECEXP (par, 0, i) = operands[2];
    7886              : 
    7887           96 :   emit_insn (gen_vec_initv2didi (reg, par));
    7888              : 
    7889           96 :   if (negate)
    7890            0 :     emit_insn (gen_negv2di2 (reg, reg));
    7891              : 
    7892           96 :   emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
    7893              : }
    7894              : 
    7895              : /* Replace all occurrences of REG FROM with REG TO in X, including
    7896              :    occurrences with different modes.  FROM and TO must be REGs of the
                           same mode.  X itself is not modified: it is returned unchanged when
                           FROM is not mentioned in it, otherwise the replacement is done on a
                           copy made with copy_rtx and that copy is returned.  */
    7897              : 
    7898              : rtx
    7899        39725 : ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
    7900              : {
    7901        39725 :   gcc_checking_assert (REG_P (from)
    7902              :                        && REG_P (to)
    7903              :                        && GET_MODE (from) == GET_MODE (to));
                          /* Nothing to replace: hand back X itself without copying.  */
    7904        39725 :   if (!reg_overlap_mentioned_p (from, x))
    7905              :     return x;
    7906           94 :   rtx ret = copy_rtx (x);
    7907           94 :   subrtx_ptr_iterator::array_type array;
                          /* Visit every sub-rtx slot of the copy.  From here on X is reused
                             as a scratch variable holding the rtx in the current slot.  */
    7908          458 :   FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
    7909              :     {
    7910          364 :       rtx *loc = *iter;
    7911          364 :       x = *loc;
    7912          364 :       if (REG_P (x) && REGNO (x) == REGNO (from))
    7913              :         {
                                  /* The FROM rtx itself is replaced by TO directly.  Any other
                                     REG with FROM's register number is replaced by a new REG
                                     that keeps its own mode but uses TO's register number;
                                     such a reference must satisfy REG_NREGS == 1.  */
    7914           94 :           if (x == from)
    7915           94 :             *loc = to;
    7916              :           else
    7917              :             {
    7918            0 :               gcc_checking_assert (REG_NREGS (x) == 1);
    7919            0 :               *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
    7920              :             }
    7921              :         }
    7922              :     }
    7923           94 :   return ret;
    7924           94 : }
    7925              : 
    7926              : /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
    7927              :    DImode for constant loop counts.  COUNT_EXP's own mode is used when it
                           has one; a mode-less rtx that is not a CONST_INT gets Pmode; a
                           CONST_INT gets DImode only on 64-bit targets when it has bits set
                           above the low 32, and SImode otherwise.  */
    7928              : 
    7929              : static machine_mode
    7930        32362 : counter_mode (rtx count_exp)
    7931              : {
    7932         7507 :   if (GET_MODE (count_exp) != VOIDmode)
    7933        25679 :     return GET_MODE (count_exp);
    7934         6683 :   if (!CONST_INT_P (count_exp))
    7935            0 :     return Pmode;
                          /* The mask has to be formed in HOST_WIDE_INT: a plain ~0xffffffff
                             is computed in unsigned int and is zero, which would make this
                             test always false and select SImode for counts that do not fit
                             in 32 bits.  */
    7936              :   if (TARGET_64BIT && (INTVAL (count_exp) & ~HOST_WIDE_INT_C (0xffffffff)))
    7937              :     return DImode;
    7938              :   return SImode;
    7939              : }
    7940              : 
    7941              : /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
    7942              :    by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
    7943              :    size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
    7944              :    equivalent loop to set memory by VALUE (supposed to be in MODE).
    7945              : 
    7946              :    The size is rounded down to whole number of chunk size moved at once.
    7947              :    SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.  */
    7948              : 
    7949              : 
    7950              : static void
    7951        18256 : expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
    7952              :                                rtx destptr, rtx srcptr, rtx value,
    7953              :                                rtx count, machine_mode mode, int unroll,
    7954              :                                int expected_size, bool issetmem)
    7955              : {
    7956        18256 :   rtx_code_label *out_label = nullptr;
    7957        18256 :   rtx_code_label *top_label = nullptr;
    7958        18256 :   rtx iter, tmp;
    7959        18256 :   machine_mode iter_mode = counter_mode (count);
    7960        18256 :   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
    7961        18256 :   rtx piece_size = GEN_INT (piece_size_n);
    7962        36512 :   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
    7963        18256 :   rtx size;
    7964        18256 :   int i;
    7965        18256 :   int loop_count;
    7966              : 
    7967        18256 :   if (expected_size != -1 && CONST_INT_P (count))
    7968         6604 :     loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
    7969              :   else
    7970              :     loop_count = -1;
    7971              : 
    7972              :   /* Don't generate the loop if the loop count is 1.  */
    7973         6604 :   if (loop_count != 1)
    7974              :     {
    7975        18254 :       top_label = gen_label_rtx ();
    7976        18254 :       out_label = gen_label_rtx ();
    7977              :     }
    7978        18256 :   iter = gen_reg_rtx (iter_mode);
    7979              : 
    7980        18256 :   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
    7981              :                               NULL, 1, OPTAB_DIRECT);
    7982              :   /* Those two should combine.  */
    7983        18256 :   if (piece_size == const1_rtx)
    7984              :     {
    7985         4193 :       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
    7986              :                                true, out_label);
    7987         4193 :       predict_jump (REG_BR_PROB_BASE * 10 / 100);
    7988              :     }
    7989        18256 :   emit_move_insn (iter, const0_rtx);
    7990              : 
    7991        18256 :   if (loop_count != 1)
    7992        18254 :     emit_label (top_label);
    7993              : 
    7994        21001 :   tmp = convert_modes (Pmode, iter_mode, iter, true);
    7995              : 
    7996              :   /* This assert could be relaxed - in this case we'll need to compute
    7997              :      smallest power of two, containing in PIECE_SIZE_N and pass it to
    7998              :      offset_address.  */
    7999        18256 :   gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
    8000        18256 :   destmem = offset_address (destmem, tmp, piece_size_n);
    8001        18256 :   destmem = adjust_address (destmem, mode, 0);
    8002              : 
    8003        18256 :   if (!issetmem)
    8004              :     {
    8005        11961 :       srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
    8006        11961 :       srcmem = adjust_address (srcmem, mode, 0);
    8007              : 
    8008              :       /* When unrolling for chips that reorder memory reads and writes,
    8009              :          we can save registers by using single temporary.
    8010              :          Also using 4 temporaries is overkill in 32bit mode.  */
    8011        11961 :       if (!TARGET_64BIT && 0)
    8012              :         {
    8013              :           for (i = 0; i < unroll; i++)
    8014              :             {
    8015              :               if (i)
    8016              :                 {
    8017              :                   destmem = adjust_address (copy_rtx (destmem), mode,
    8018              :                                             GET_MODE_SIZE (mode));
    8019              :                   srcmem = adjust_address (copy_rtx (srcmem), mode,
    8020              :                                            GET_MODE_SIZE (mode));
    8021              :                 }
    8022              :               emit_move_insn (destmem, srcmem);
    8023              :             }
    8024              :         }
    8025              :       else
    8026              :         {
    8027        11961 :           rtx tmpreg[4];
    8028        11961 :           gcc_assert (unroll <= 4);
    8029        49514 :           for (i = 0; i < unroll; i++)
    8030              :             {
    8031        37553 :               tmpreg[i] = gen_reg_rtx (mode);
    8032        37553 :               if (i)
    8033        51184 :                 srcmem = adjust_address (copy_rtx (srcmem), mode,
    8034              :                                          GET_MODE_SIZE (mode));
    8035        37553 :               emit_move_insn (tmpreg[i], srcmem);
    8036              :             }
    8037        49514 :           for (i = 0; i < unroll; i++)
    8038              :             {
    8039        37553 :               if (i)
    8040        51184 :                 destmem = adjust_address (copy_rtx (destmem), mode,
    8041              :                                           GET_MODE_SIZE (mode));
    8042        37553 :               emit_move_insn (destmem, tmpreg[i]);
    8043              :             }
    8044              :         }
    8045              :     }
    8046              :   else
    8047        29018 :     for (i = 0; i < unroll; i++)
    8048              :       {
    8049        22723 :         if (i)
    8050        32856 :           destmem = adjust_address (copy_rtx (destmem), mode,
    8051              :                                     GET_MODE_SIZE (mode));
    8052        22723 :         emit_move_insn (destmem, value);
    8053              :       }
    8054              : 
    8055        18256 :   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
    8056              :                              true, OPTAB_LIB_WIDEN);
    8057        18256 :   if (tmp != iter)
    8058            0 :     emit_move_insn (iter, tmp);
    8059              : 
    8060        18256 :   if (loop_count != 1)
    8061              :     {
    8062        18254 :       emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
    8063              :                                true, top_label);
    8064        18254 :       if (expected_size != -1)
    8065              :         {
    8066         9105 :           expected_size /= GET_MODE_SIZE (mode) * unroll;
    8067         9105 :           if (expected_size == 0)
    8068            1 :             predict_jump (0);
    8069         9104 :           else if (expected_size > REG_BR_PROB_BASE)
    8070            2 :             predict_jump (REG_BR_PROB_BASE - 1);
    8071              :           else
    8072         9102 :             predict_jump (REG_BR_PROB_BASE
    8073         9102 :                           - (REG_BR_PROB_BASE + expected_size / 2)
    8074         9102 :                             / expected_size);
    8075              :         }
    8076              :       else
    8077         9149 :         predict_jump (REG_BR_PROB_BASE * 80 / 100);
    8078              :     }
    8079        18256 :   iter = ix86_zero_extend_to_Pmode (iter);
    8080        21001 :   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
    8081              :                              true, OPTAB_LIB_WIDEN);
    8082        18256 :   if (tmp != destptr)
    8083            0 :     emit_move_insn (destptr, tmp);
    8084        18256 :   if (!issetmem)
    8085              :     {
    8086        13308 :       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
    8087              :                                  true, OPTAB_LIB_WIDEN);
    8088        11961 :       if (tmp != srcptr)
    8089            0 :         emit_move_insn (srcptr, tmp);
    8090              :     }
    8091        18256 :   if (loop_count != 1)
    8092        18254 :     emit_label (out_label);
    8093        18256 : }
    8094              : 
    8095              : /* Divide COUNTREG by SCALE and return the quotient as an rtx.  */
    8096              : static rtx
    8097        16660 : scale_counter (rtx countreg, int scale)
    8098              : {
    8099        16660 :   rtx sc;
    8100              : 
    8101        16660 :   if (scale == 1)  /* Dividing by one: hand back COUNTREG unchanged.  */
    8102              :     return countreg;
    8103        10742 :   if (CONST_INT_P (countreg))  /* Constant count: fold the division now.  */
    8104        10725 :     return GEN_INT (INTVAL (countreg) / scale);
    8105           17 :   gcc_assert (REG_P (countreg));  /* Anything non-constant must be a register.  */
    8106              : 
    8107           51 :   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
    8108           34 :                             GEN_INT (exact_log2 (scale)),
    8109              :                             NULL, 1, OPTAB_DIRECT);  /* Logical right shift by log2 (SCALE); assumes SCALE is a power of two -- TODO confirm with callers.  */
    8110           17 :   return sc;
    8111              : }
    8112              : 
    8113              : /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
    8114              :    When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
    8115              :    When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
    8116              :    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size;
    8117              :    ORIG_VALUE is the original value passed to memset to fill the memory with.
    8118              :    Other arguments have the same meaning as for the previous function.  */
    8119              : 
    8120              : static void
    8121        16660 : expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
    8122              :                            rtx destptr, rtx srcptr, rtx value, rtx orig_value,
    8123              :                            rtx count,
    8124              :                            machine_mode mode, bool issetmem)
    8125              : {
    8126        16660 :   rtx destexp;
    8127        16660 :   rtx srcexp;
    8128        16660 :   rtx countreg;
    8129        16660 :   HOST_WIDE_INT rounded_count;
    8130              : 
    8131              :   /* If possible, it is shorter to use rep movs: a QImode operation whose constant COUNT is a multiple of 4 is done in SImode instead, unless TARGET_PREFER_KNOWN_REP_MOVSB_STOSB is set (for setmem only when storing zero).
    8132              :      TODO: Maybe it is better to move this logic to decide_alg.  */
    8133        16660 :   if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
    8134          243 :       && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
    8135          239 :       && (!issetmem || orig_value == const0_rtx))
    8136        16660 :     mode = SImode;
    8137              : 
    8138        16660 :   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    8139        16382 :     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);  /* Make DESTMEM a BLKmode reference addressed by DESTPTR.  */
    8140              : 
    8141        33320 :   countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
    8142        16660 :                                                        GET_MODE_SIZE (mode)));  /* Iteration count: COUNT divided by the size of MODE.  */
    8143        16660 :   if (mode != QImode)
    8144              :     {
    8145        32486 :       destexp = gen_rtx_ASHIFT (Pmode, countreg,
    8146              :                                 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
    8147        11002 :       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);  /* DESTEXP = (COUNTREG << log2 (size of MODE)) + DESTPTR.  */
    8148              :     }
    8149              :   else
    8150         5940 :     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);  /* Byte elements: no scaling needed.  */
    8151        16660 :   if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    8152              :     {
    8153        11412 :       rounded_count
    8154        11412 :         = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
    8155        11412 :       destmem = shallow_copy_rtx (destmem);  /* Set the size on a copy rather than on the caller's MEM.  */
    8156        11412 :       set_mem_size (destmem, rounded_count);  /* COUNT rounded down to a whole number of MODE-sized elements.  */
    8157              :     }
    8158         5256 :   else if (MEM_SIZE_KNOWN_P (destmem))
    8159          333 :     clear_mem_size (destmem);  /* Otherwise drop any recorded size.  */
    8160              : 
    8161        16660 :   if (issetmem)
    8162              :     {
    8163         6067 :       value = force_reg (mode, gen_lowpart (mode, value));  /* The stored value is the MODE-sized low part of VALUE, in a register.  */
    8164         6067 :       emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    8165              :     }
    8166              :   else
    8167              :     {
    8168        10593 :       if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
    8169        10378 :         srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);  /* Same normalisation as for DESTMEM above.  */
    8170        10593 :       if (mode != QImode)
    8171              :         {
    8172        18160 :           srcexp = gen_rtx_ASHIFT (Pmode, countreg,
    8173              :                                    GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
    8174         6176 :           srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);  /* SRCEXP mirrors DESTEXP, based on SRCPTR.  */
    8175              :         }
    8176              :       else
    8177         4619 :         srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
    8178        10593 :       if (CONST_INT_P (count))
    8179              :         {
    8180         6487 :           rounded_count
    8181         6487 :             = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
    8182         6487 :           srcmem = shallow_copy_rtx (srcmem);
    8183         6487 :           set_mem_size (srcmem, rounded_count);
    8184              :         }
    8185              :       else
    8186              :         {
    8187         4120 :           if (MEM_SIZE_KNOWN_P (srcmem))
    8188            0 :             clear_mem_size (srcmem);
    8189              :         }
    8190        10593 :       emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
    8191              :                               destexp, srcexp));
    8192              :     }
    8193        16660 : }
    8194              : 
    8195              : /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
    8196              :    DESTMEM, advancing DESTPTR and SRCPTR after each piece.
    8197              :    SRCMEM is passed by pointer and is updated on return.
    8198              :    Return value is the updated DESTMEM.  */
    8199              : static rtx
    8200           13 : emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
    8201              :              HOST_WIDE_INT size_to_move)
    8202              : {
    8203           13 :   rtx dst = destmem, src = *srcmem, tempreg;
    8204           13 :   enum insn_code code;
    8205           13 :   machine_mode move_mode;
    8206           13 :   int piece_size, i;
    8207              : 
    8208              :   /* Find the widest mode in which we could perform moves.
    8209              :      Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
    8210              :      it until a move of such size is supported.  */
    8211           13 :   piece_size = 1 << floor_log2 (size_to_move);
    8212           26 :   while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
    8213           26 :          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    8214              :     {
    8215            0 :       gcc_assert (piece_size > 1);  /* A one-byte move must always be available.  */
    8216            0 :       piece_size >>= 1;
    8217              :     }
    8218              : 
    8219              :   /* Find the corresponding vector mode with the same size as MOVE_MODE.
    8220              :      MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
    8221           39 :   if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    8222              :     {
    8223            0 :       int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
    8224            0 :       if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
    8225            0 :           || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    8226              :         {
    8227            0 :           move_mode = word_mode;  /* No usable vector mode: fall back to word-sized pieces.  */
    8228            0 :           piece_size = GET_MODE_SIZE (move_mode);
    8229            0 :           code = optab_handler (mov_optab, move_mode);
    8230              :         }
    8231              :     }
    8232           13 :   gcc_assert (code != CODE_FOR_nothing);
    8233              : 
    8234           13 :   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
    8235           13 :   src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
    8236              : 
    8237              :   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
    8238           13 :   gcc_assert (size_to_move % piece_size == 0);
    8239              : 
    8240           26 :   for (i = 0; i < size_to_move; i += piece_size)
    8241              :     {
    8242              :       /* We move from memory to memory, so we'll need to do it via
    8243              :          a temporary register.  */
    8244           13 :       tempreg = gen_reg_rtx (move_mode);
    8245           13 :       emit_insn (GEN_FCN (code) (tempreg, src));
    8246           13 :       emit_insn (GEN_FCN (code) (dst, tempreg));
    8247              : 
    8248           26 :       emit_move_insn (destptr,
    8249           13 :                       plus_constant (Pmode, copy_rtx (destptr), piece_size));  /* Advance DESTPTR past the piece just stored.  */
    8250           26 :       emit_move_insn (srcptr,
    8251           13 :                       plus_constant (Pmode, copy_rtx (srcptr), piece_size));  /* Likewise for SRCPTR.  */
    8252              : 
    8253           13 :       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
    8254              :                                           piece_size);
    8255           13 :       src = adjust_automodify_address_nv (src, move_mode, srcptr,
    8256              :                                           piece_size);
    8257              :     }
    8258              : 
    8259              :   /* Update DST and SRC rtx.  */
    8260           13 :   *srcmem = src;
    8261           13 :   return dst;
    8262              : }
    8263              : 
    8264              : /* Helper function for the string operations below.  Test whether VARIABLE
    8265              :    masked with VALUE is zero; if so, jump to the label that is returned.  EPILOGUE selects the branch prediction hint.  */
    8266              : 
    8267              : static rtx_code_label *
    8268        35973 : ix86_expand_aligntest (rtx variable, int value, bool epilogue)
    8269              : {
    8270        35973 :   rtx_code_label *label = gen_label_rtx ();
    8271        35973 :   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
    8272        35973 :   if (GET_MODE (variable) == DImode)
    8273          905 :     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
    8274              :   else
    8275        35068 :     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));  /* Every mode other than DImode takes the SImode AND.  */
    8276        35973 :   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
    8277              :                            1, label);  /* Jump to LABEL when VARIABLE & VALUE == 0.  */
    8278        35973 :   if (epilogue)
    8279            3 :     predict_jump (REG_BR_PROB_BASE * 50 / 100);  /* Hint of 50% of REG_BR_PROB_BASE in the epilogue.  */
    8280              :   else
    8281        35970 :     predict_jump (REG_BR_PROB_BASE * 90 / 100);  /* Hint of 90% of REG_BR_PROB_BASE otherwise.  */
    8282        35973 :   return label;  /* The caller is expected to emit LABEL after the guarded code.  */
    8283              : }
    8284              : 
    8285              : 
    8286              : /* Output code to copy at most count & (max_size - 1) bytes from SRCMEM to DESTMEM.  */
    8287              : 
    8288              : static void
    8289         8580 : expand_cpymem_epilogue (rtx destmem, rtx srcmem,
    8290              :                         rtx destptr, rtx srcptr, rtx count, int max_size)
    8291              : {
    8292         8580 :   rtx src, dest;
    8293         8580 :   if (CONST_INT_P (count))
    8294              :     {
    8295         6401 :       unsigned HOST_WIDE_INT countval = UINTVAL (count);
    8296         6401 :       unsigned HOST_WIDE_INT epilogue_size = countval % max_size;  /* Known count: exactly this many bytes remain.  */
    8297         6401 :       unsigned int destalign = MEM_ALIGN (destmem);
    8298         6401 :       cfun->machine->by_pieces_in_use = true;  /* Flag is set only for the duration of the move_by_pieces call.  */
    8299         6401 :       move_by_pieces (destmem, srcmem, epilogue_size, destalign,
    8300              :                       RETURN_BEGIN);
    8301         6401 :       cfun->machine->by_pieces_in_use = false;
    8302         6401 :       return;
    8303              :     }
    8304         2179 :   if (max_size > 8)
    8305              :     {
    8306         2179 :       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
    8307              :                                     count, 1, OPTAB_DIRECT);  /* COUNT &= MAX_SIZE - 1.  */
    8308         2179 :       expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
    8309              :                                      count, QImode, 1, 4, false);  /* Copy the remaining bytes with a QImode loop.  */
    8310         2179 :       return;
    8311              :     }
    8312              : 
    8313              :   /* When there are stringops, we can cheaply increase dest and src pointers.
    8314              :      Otherwise we save code size by maintaining offset (zero is readily
    8315              :      available from preceding rep operation) and using x86 addressing modes.
    8316              :    */
    8317            0 :   if (TARGET_SINGLE_STRINGOP)
    8318              :     {
    8319            0 :       if (max_size > 4)
    8320              :         {
    8321            0 :           rtx_code_label *label = ix86_expand_aligntest (count, 4, true);  /* Skip the 4-byte move when (COUNT & 4) is zero.  */
    8322            0 :           src = change_address (srcmem, SImode, srcptr);
    8323            0 :           dest = change_address (destmem, SImode, destptr);
    8324            0 :           emit_insn (gen_strmov (destptr, dest, srcptr, src));
    8325            0 :           emit_label (label);
    8326            0 :           LABEL_NUSES (label) = 1;
    8327              :         }
    8328            0 :       if (max_size > 2)
    8329              :         {
    8330            0 :           rtx_code_label *label = ix86_expand_aligntest (count, 2, true);  /* Likewise for a 2-byte move.  */
    8331            0 :           src = change_address (srcmem, HImode, srcptr);
    8332            0 :           dest = change_address (destmem, HImode, destptr);
    8333            0 :           emit_insn (gen_strmov (destptr, dest, srcptr, src));
    8334            0 :           emit_label (label);
    8335            0 :           LABEL_NUSES (label) = 1;
    8336              :         }
    8337            0 :       if (max_size > 1)
    8338              :         {
    8339            0 :           rtx_code_label *label = ix86_expand_aligntest (count, 1, true);  /* Likewise for the final byte.  */
    8340            0 :           src = change_address (srcmem, QImode, srcptr);
    8341            0 :           dest = change_address (destmem, QImode, destptr);
    8342            0 :           emit_insn (gen_strmov (destptr, dest, srcptr, src));
    8343            0 :           emit_label (label);
    8344            0 :           LABEL_NUSES (label) = 1;
    8345              :         }
    8346              :     }
    8347              :   else
    8348              :     {
    8349            0 :       rtx offset = force_reg (Pmode, const0_rtx);  /* Running byte offset from SRCPTR/DESTPTR, starting at zero.  */
    8350            0 :       rtx tmp;
    8351              : 
    8352            0 :       if (max_size > 4)
    8353              :         {
    8354            0 :           rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
    8355            0 :           src = change_address (srcmem, SImode, srcptr);  /* OFFSET is still zero here, so the bare pointers are used.  */
    8356            0 :           dest = change_address (destmem, SImode, destptr);
    8357            0 :           emit_move_insn (dest, src);
    8358            0 :           tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
    8359              :                                      true, OPTAB_LIB_WIDEN);
    8360            0 :           if (tmp != offset)
    8361            0 :             emit_move_insn (offset, tmp);
    8362            0 :           emit_label (label);
    8363            0 :           LABEL_NUSES (label) = 1;
    8364              :         }
    8365            0 :       if (max_size > 2)
    8366              :         {
    8367            0 :           rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
    8368            0 :           tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
    8369            0 :           src = change_address (srcmem, HImode, tmp);
    8370            0 :           tmp = gen_rtx_PLUS (Pmode, destptr, offset);
    8371            0 :           dest = change_address (destmem, HImode, tmp);
    8372            0 :           emit_move_insn (dest, src);
    8373            0 :           tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
    8374              :                                      true, OPTAB_LIB_WIDEN);  /* NOTE(review): TMP still holds the (plus destptr offset) address built above and is passed in the position where the max_size > 4 case passes NULL -- confirm this is intended.  */
    8375            0 :           if (tmp != offset)
    8376            0 :             emit_move_insn (offset, tmp);
    8377            0 :           emit_label (label);
    8378            0 :           LABEL_NUSES (label) = 1;
    8379              :         }
    8380            0 :       if (max_size > 1)
    8381              :         {
    8382            0 :           rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
    8383            0 :           tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
    8384            0 :           src = change_address (srcmem, QImode, tmp);
    8385            0 :           tmp = gen_rtx_PLUS (Pmode, destptr, offset);
    8386            0 :           dest = change_address (destmem, QImode, tmp);
    8387            0 :           emit_move_insn (dest, src);  /* Last piece: OFFSET is not updated afterwards.  */
    8388            0 :           emit_label (label);
    8389            0 :           LABEL_NUSES (label) = 1;
    8390              :         }
    8391              :     }
    8392              : }
    8393              : 
    8394              : /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
    8395              :    with value PROMOTED_VAL.
    8396              :    DESTPTR is the pointer used to address DESTMEM.
    8397              :    Return value is updated DST.  */
    8398              : static rtx
    8399            6 : emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
    8400              :              HOST_WIDE_INT size_to_move)
    8401              : {
    8402            6 :   rtx dst = destmem;
    8403            6 :   enum insn_code code;
    8404            6 :   machine_mode move_mode;
    8405            6 :   int piece_size, i;
    8406              : 
    8407              :   /* Pick the mode for the stores.  Use the mode of PROMOTED_VAL (QImode
    8408              :      when it has VOIDmode); if SIZE_TO_MOVE is smaller than that mode, use
    8409              :      the integer mode of exactly SIZE_TO_MOVE bytes and the low part of the value.  */
    8410            6 :   move_mode = GET_MODE (promoted_val);
    8411            6 :   if (move_mode == VOIDmode)
    8412            0 :     move_mode = QImode;
    8413           12 :   if (size_to_move < GET_MODE_SIZE (move_mode))
    8414              :     {
    8415            5 :       unsigned int move_bits = size_to_move * BITS_PER_UNIT;
    8416            5 :       move_mode = int_mode_for_size (move_bits, 0).require ();
    8417            5 :       promoted_val = gen_lowpart (move_mode, promoted_val);
    8418              :     }
    8419            6 :   piece_size = GET_MODE_SIZE (move_mode);
    8420            6 :   code = optab_handler (mov_optab, move_mode);
    8421            6 :   gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
    8422              : 
    8423            6 :   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
    8424              : 
    8425              :   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
    8426            6 :   gcc_assert (size_to_move % piece_size == 0);
    8427              : 
    8428           12 :   for (i = 0; i < size_to_move; i += piece_size)
    8429              :     {
    8430           12 :       if (piece_size <= GET_MODE_SIZE (word_mode))  /* Pieces no wider than a word go through the strset pattern.  */
    8431              :         {
    8432            4 :           emit_insn (gen_strset (destptr, dst, promoted_val));
    8433            4 :           dst = adjust_automodify_address_nv (dst, move_mode, destptr,
    8434              :                                               piece_size);
    8435            4 :           continue;
    8436              :         }
    8437              : 
    8438            2 :       emit_insn (GEN_FCN (code) (dst, promoted_val));  /* Wider pieces use the plain move pattern.  */
    8439              : 
    8440            4 :       emit_move_insn (destptr,
    8441            2 :                       plus_constant (Pmode, copy_rtx (destptr), piece_size));  /* Advance DESTPTR explicitly on this path.  */
    8442              : 
    8443            2 :       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
    8444              :                                           piece_size);
    8445              :     }
    8446              : 
    8447              :   /* Update DST rtx.  */
    8448            6 :   return dst;
    8449              : }
    8450              : /* Output code to set at most count & (max_size - 1) bytes starting at DESTMEM, using expand_set_or_cpymem_via_loop in QImode.  */
    8451              : static void
    8452          311 : expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
    8453              :                                  rtx count, int max_size)
    8454              : {
    8455          622 :   count = expand_simple_binop (counter_mode (count), AND, count,
    8456          311 :                                GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);  /* COUNT &= MAX_SIZE - 1.  */
    8457          311 :   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
    8458          311 :                                  gen_lowpart (QImode, value), count, QImode,
    8459              :                                  1, max_size / 2, true);  /* No source operands; the QImode low part of VALUE is stored.  The trailing arguments are presumably unroll 1, expected size MAX_SIZE / 2 and ISSETMEM true -- confirm against the callee's signature.  */
    8460          311 : }
    8461              : 
    8462              : /* Callback routine for store_by_pieces.  Return the RTL of a register
    8463              :    containing GET_MODE_SIZE (MODE) bytes taken from the RTL register OP_P,
    8464              :    which is an integer or a word vector register.  If PREV_P isn't nullptr,
    8465              :    it has the RTL info from the previous iteration.  The unnamed HOST_WIDE_INT argument is unused.  */
    8466              : 
    8467              : static rtx
    8468         5018 : setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
    8469              :                          fixed_size_mode mode)
    8470              : {
    8471         5018 :   rtx target;
    8472         5018 :   by_pieces_prev *prev = (by_pieces_prev *) prev_p;
    8473         5018 :   if (prev)
    8474              :     {
    8475         5018 :       rtx prev_op = prev->data;
    8476         5018 :       if (prev_op)
    8477              :         {
    8478         2908 :           machine_mode prev_mode = GET_MODE (prev_op);
    8479         2908 :           if (prev_mode == mode)  /* Same mode as the previous iteration: reuse its value.  */
    8480              :             return prev_op;
    8481           54 :           if (VECTOR_MODE_P (prev_mode)
    8482         1098 :               && VECTOR_MODE_P (mode)
    8483         1152 :               && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
    8484              :             {
    8485            0 :               target = gen_rtx_SUBREG (mode, prev_op, 0);  /* Both are vectors with the same element mode: view the previous value in MODE.  */
    8486            0 :               return target;
    8487              :             }
    8488              :         }
    8489              :     }
    8490              : 
    8491         3262 :   rtx op = (rtx) op_p;
    8492         3262 :   machine_mode op_mode = GET_MODE (op);
    8493              : 
    8494         3262 :   if (VECTOR_MODE_P (mode))
    8495              :     {
    8496         3692 :       gcc_assert (GET_MODE_INNER (mode) == QImode);  /* Only QImode-element vectors are expected here.  */
    8497              : 
    8498         1846 :       unsigned int op_size = GET_MODE_SIZE (op_mode);
    8499         1846 :       unsigned int size = GET_MODE_SIZE (mode);
    8500         1846 :       unsigned int nunits;
    8501         1846 :       machine_mode vec_mode;
    8502         1846 :       if (op_size < size)
    8503              :         {
    8504              :           /* If OP size is smaller than MODE size, duplicate it.  */
    8505            1 :           nunits = size / GET_MODE_SIZE (QImode);
    8506            1 :           vec_mode = mode_for_vector (QImode, nunits).require ();
    8507            1 :           nunits = size / op_size;  /* Number of copies of OP needed to fill SIZE bytes.  */
    8508            1 :           gcc_assert (SCALAR_INT_MODE_P (op_mode));
    8509            1 :           machine_mode dup_mode
    8510            1 :             = mode_for_vector (as_a <scalar_mode> (op_mode),
    8511            2 :                                nunits).require ();
    8512            1 :           target = gen_reg_rtx (vec_mode);
    8513            1 :           op = gen_vec_duplicate (dup_mode, op);
    8514            1 :           rtx dup_op = gen_reg_rtx (dup_mode);
    8515            1 :           emit_move_insn (dup_op, op);
    8516            1 :           op = gen_rtx_SUBREG (vec_mode, dup_op, 0);  /* View the duplicated vector as a QImode vector.  */
    8517            1 :           emit_move_insn (target, op);
    8518            1 :           return target;
    8519              :         }
    8520         1845 :       nunits = op_size / GET_MODE_SIZE (QImode);  /* OP is at least as wide as MODE: view it as a QImode vector of its own size.  */
    8521         1845 :       vec_mode = mode_for_vector (QImode, nunits).require ();
    8522         1845 :       target = gen_reg_rtx (vec_mode);
    8523         1845 :       op = gen_rtx_SUBREG (vec_mode, op, 0);
    8524         1845 :       emit_move_insn (target, op);
    8525         1845 :       if (op_size == size)
    8526              :         return target;
    8527              : 
    8528            0 :       rtx tmp = gen_reg_rtx (mode);  /* OP is wider than MODE: copy out a MODE subreg at byte offset 0.  */
    8529            0 :       target = gen_rtx_SUBREG (mode, target, 0);
    8530            0 :       emit_move_insn (tmp, target);
    8531            0 :       return tmp;
    8532              :     }
    8533              : 
    8534         1416 :   if (VECTOR_MODE_P (op_mode))  /* Non-vector MODE requested from a vector OP: extract a word first.  */
    8535              :     {
    8536         2822 :       gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
    8537         1411 :       target = gen_reg_rtx (word_mode);
    8538         1411 :       op = gen_rtx_SUBREG (word_mode, op, 0);
    8539         1411 :       emit_move_insn (target, op);
    8540              :     }
    8541              :   else
    8542              :     target = op;
    8543              : 
    8544         1416 :   if (mode == GET_MODE (target))
    8545              :     return target;
    8546              : 
    8547          241 :   rtx tmp = gen_reg_rtx (mode);  /* Modes differ: copy out a MODE subreg at byte offset 0.  */
    8548          241 :   target = gen_rtx_SUBREG (mode, target, 0);
    8549          241 :   emit_move_insn (tmp, target);
    8550          241 :   return tmp;
    8551              : }
    8552              : 
    8553              : /* Output code to set at most count & (max_size - 1) bytes starting at DESTMEM.  */
    8554              : static void
    8555         7916 : expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
    8556              :                         rtx count, int max_size)
    8557              : {
    8558         7916 :   rtx dest;
    8559              : 
    8560         7916 :   if (CONST_INT_P (count))
    8561              :     {
    8562         7604 :       unsigned HOST_WIDE_INT countval = UINTVAL (count);
    8563         7604 :       unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
    8564         7604 :       unsigned int destalign = MEM_ALIGN (destmem);
    8565         7604 :       cfun->machine->by_pieces_in_use = true;
    8566        12303 :       store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
    8567              :                        vec_value ? vec_value : value, destalign, true,
    8568              :                        RETURN_BEGIN);
    8569         7604 :       cfun->machine->by_pieces_in_use = false;
    8570         7604 :       return;
    8571              :     }
    8572          312 :   if (max_size > 32)
    8573              :     {
    8574          311 :       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
    8575          311 :       return;
    8576              :     }
    8577            1 :   if (max_size > 16)
    8578              :     {
    8579            0 :       rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
    8580            0 :       if (TARGET_64BIT)
    8581              :         {
    8582            0 :           dest = change_address (destmem, DImode, destptr);
    8583            0 :           emit_insn (gen_strset (destptr, dest, value));
    8584            0 :           dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
    8585            0 :           emit_insn (gen_strset (destptr, dest, value));
    8586              :         }
    8587              :       else
    8588              :         {
    8589            0 :           dest = change_address (destmem, SImode, destptr);
    8590            0 :           emit_insn (gen_strset (destptr, dest, value));
    8591            0 :           dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
    8592            0 :           emit_insn (gen_strset (destptr, dest, value));
    8593            0 :           dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
    8594            0 :           emit_insn (gen_strset (destptr, dest, value));
    8595            0 :           dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
    8596            0 :           emit_insn (gen_strset (destptr, dest, value));
    8597              :         }
    8598            0 :       emit_label (label);
    8599            0 :       LABEL_NUSES (label) = 1;
    8600              :     }
    8601            1 :   if (max_size > 8)
    8602              :     {
    8603            0 :       rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
    8604            0 :       if (TARGET_64BIT)
    8605              :         {
    8606            0 :           dest = change_address (destmem, DImode, destptr);
    8607            0 :           emit_insn (gen_strset (destptr, dest, value));
    8608              :         }
    8609              :       else
    8610              :         {
    8611            0 :           dest = change_address (destmem, SImode, destptr);
    8612            0 :           emit_insn (gen_strset (destptr, dest, value));
    8613            0 :           dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
    8614            0 :           emit_insn (gen_strset (destptr, dest, value));
    8615              :         }
    8616            0 :       emit_label (label);
    8617            0 :       LABEL_NUSES (label) = 1;
    8618              :     }
    8619            1 :   if (max_size > 4)
    8620              :     {
    8621            1 :       rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
    8622            1 :       dest = change_address (destmem, SImode, destptr);
    8623            1 :       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
    8624            1 :       emit_label (label);
    8625            1 :       LABEL_NUSES (label) = 1;
    8626              :     }
    8627            1 :   if (max_size > 2)
    8628              :     {
    8629            1 :       rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
    8630            1 :       dest = change_address (destmem, HImode, destptr);
    8631            1 :       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
    8632            1 :       emit_label (label);
    8633            1 :       LABEL_NUSES (label) = 1;
    8634              :     }
    8635            1 :   if (max_size > 1)
    8636              :     {
    8637            1 :       rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
    8638            1 :       dest = change_address (destmem, QImode, destptr);
    8639            1 :       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
    8640            1 :       emit_label (label);
    8641            1 :       LABEL_NUSES (label) = 1;
    8642              :     }
    8643              : }
    8644              : 
    8645              : /* Decrease COUNTREG by VALUE: emit an insn that adds -VALUE to COUNTREG.  */
    8646              : static void
    8647           19 : ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
    8648              : {
    8649           19 :   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
    8650           19 : }
    8651              : 
    8652              : /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
    8653              :    DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
    8654              :    For each power of two I with ALIGN <= I < DESIRED_ALIGNMENT a guarded step
    8655              :    sets or copies I bytes and decreases COUNT by I.  Depending on ISSETMEM,
    8656              :    either SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.  Return value is updated DESTMEM.  */
    8657              : 
    8658              : static rtx
    8659            7 : expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
    8660              :                                   rtx destptr, rtx srcptr, rtx value,
    8661              :                                   rtx vec_value, rtx count, int align,
    8662              :                                   int desired_alignment, bool issetmem)
    8663              : {
    8664            7 :   int i;
    8665           35 :   for (i = 1; i < desired_alignment; i <<= 1)  /* I walks the powers of two.  */
    8666              :     {
    8667           28 :       if (align <= i)  /* Alignments below ALIGN are already guaranteed.  */
    8668              :         {
    8669           19 :           rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);  /* NOTE(review): presumably branches to LABEL when DESTPTR & I is zero -- confirm.  */
    8670           19 :           if (issetmem)
    8671              :             {
    8672           12 :               if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))  /* Piece wider than VALUE's mode: use VEC_VALUE.  */
    8673            2 :                 destmem = emit_memset (destmem, destptr, vec_value, i);
    8674              :               else
    8675            4 :                 destmem = emit_memset (destmem, destptr, value, i);
    8676              :             }
    8677              :           else
    8678           13 :             destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
    8679           19 :           ix86_adjust_counter (count, i);  /* I bytes were handled.  */
    8680           19 :           emit_label (label);
    8681           19 :           LABEL_NUSES (label) = 1;
    8682           19 :           set_mem_align (destmem, i * 2 * BITS_PER_UNIT);  /* Record DESTMEM as 2*I-byte aligned from here on.  */
    8683              :         }
    8684              :     }
    8685            7 :   return destmem;
    8686              : }
    8687              : 
    8688              : /* Test if COUNT&SIZE is nonzero and if so, expand cpymem
    8689              :    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
    8690              :    and jump to DONE_LABEL.  The sequence moves SIZE bytes at the start of the block and SIZE bytes at its end.  */
    8691              : static void
    8692        28754 : expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
    8693              :                                rtx destptr, rtx srcptr,
    8694              :                                rtx value, rtx vec_value,
    8695              :                                rtx count, int size,
    8696              :                                rtx done_label, bool issetmem)
    8697              : {
    8698        28754 :   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
    8699        28754 :   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
    8700        28754 :   rtx modesize;
    8701        28754 :   rtx scalar_value = value;  /* VALUE may be replaced by VEC_VALUE below; keep the scalar.  */
    8702        28754 :   int n;
    8703              : 
    8704              :   /* If we do not have vector value to copy, we must reduce size.  */
    8705        28754 :   if (issetmem)
    8706              :     {
    8707         3618 :       if (!vec_value)
    8708              :         {
    8709            9 :           if (GET_MODE (value) == VOIDmode && size > 8)
    8710            0 :             mode = Pmode;
    8711           27 :           else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
    8712            1 :             mode = GET_MODE (value);
    8713              :         }
    8714              :       else
    8715         3609 :         mode = GET_MODE (vec_value), value = vec_value;  /* Store the vector value in its own mode.  */
    8716              :     }
    8717              :   else
    8718              :     {
    8719              :       /* Choose appropriate vector mode.  */
    8720        25136 :       if (size >= 32)
    8721         6282 :         switch (MOVE_MAX)
    8722              :           {
    8723            0 :           case 64:
    8724            0 :             if (size >= 64)
    8725              :               {
    8726              :                 mode = V64QImode;
    8727              :                 break;
    8728              :               }
    8729              :             /* FALLTHRU */
    8730            0 :           case 32:
    8731            0 :             mode = V32QImode;
    8732            0 :             break;
    8733              :           case 16:
    8734              :             mode = V16QImode;
    8735              :             break;
    8736              :           case 8:
    8737              :             mode = DImode;
    8738              :             break;
    8739            0 :           default:
    8740            0 :             gcc_unreachable ();
    8741              :           }
    8742        18854 :       else if (size >= 16)
    8743         6282 :         mode = TARGET_SSE ? V16QImode : DImode;
    8744        25136 :       srcmem = change_address (srcmem, mode, srcptr);
    8745              :     }
    8746        32363 :   if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
    8747              :     {
    8748              :       /* For memset with vector and the size is smaller than the vector
    8749              :          size, first try the narrower vector, otherwise, use the
    8750              :          original value. */
    8751         1809 :       machine_mode inner_mode = GET_MODE_INNER (mode);
    8752         1809 :       unsigned int nunits = size / GET_MODE_SIZE (inner_mode);  /* Elements that fit in SIZE bytes.  */
    8753         1809 :       if (nunits > 1)
    8754              :         {
    8755          320 :           mode = mode_for_vector (GET_MODE_INNER (mode),
    8756          320 :                                   nunits).require ();
    8757          160 :           value = gen_rtx_SUBREG (mode, value, 0);
    8758              :         }
    8759              :       else
    8760              :         {
    8761         1649 :           scalar_int_mode smode
    8762         1649 :             = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
    8763         4947 :           gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
    8764              :                       >= GET_MODE_SIZE (smode));
    8765         1649 :           mode = smode;
    8766         1649 :           if (GET_MODE (scalar_value) == mode)
    8767              :             value = scalar_value;
    8768              :           else
    8769          749 :             value = gen_rtx_SUBREG (mode, scalar_value, 0);
    8770              :         }
    8771              :     }
    8772        28754 :   destmem = change_address (destmem, mode, destptr);
    8773        57508 :   modesize = GEN_INT (GET_MODE_SIZE (mode));
    8774        57508 :   gcc_assert (GET_MODE_SIZE (mode) <= size);
    8775       129382 :   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)  /* Head: MODE-sized moves covering SIZE bytes from the block start.  */
    8776              :     {
    8777        35937 :       if (issetmem)
    8778         4519 :         emit_move_insn (destmem, gen_lowpart (mode, value));
    8779              :       else
    8780              :         {
    8781        31418 :           emit_move_insn (destmem, srcmem);
    8782        62836 :           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
    8783              :         }
    8784        71874 :       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    8785              :     }
    8786              : 
    8787        28754 :   destmem = offset_address (destmem, count, 1);  /* Add COUNT, then step back 2*SIZE ...  */
    8788        57508 :   destmem = offset_address (destmem, GEN_INT (-2 * size),
    8789        28754 :                             GET_MODE_SIZE (mode));  /* ... which is DESTPTR + COUNT - SIZE when the head loop advanced exactly SIZE bytes.  */
    8790        28754 :   if (!issetmem)
    8791              :     {
    8792        25136 :       srcmem = offset_address (srcmem, count, 1);
    8793        50272 :       srcmem = offset_address (srcmem, GEN_INT (-2 * size),
    8794        25136 :                                GET_MODE_SIZE (mode));
    8795              :     }
    8796       129382 :   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)  /* Tail: the same moves again from the rewound address.  */
    8797              :     {
    8798        35937 :       if (issetmem)
    8799         4519 :         emit_move_insn (destmem, gen_lowpart (mode, value));
    8800              :       else
    8801              :         {
    8802        31418 :           emit_move_insn (destmem, srcmem);
    8803        62836 :           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
    8804              :         }
    8805        71874 :       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    8806              :     }
    8807        28754 :   emit_jump_insn (gen_jump (done_label));  /* This size class is finished.  */
    8808        28754 :   emit_barrier ();
    8809              : 
    8810        28754 :   emit_label (label);  /* Placed after the barrier, so only reachable by a jump to LABEL.  */
    8811        28754 :   LABEL_NUSES (label) = 1;
    8812        28754 : }
    8813              : 
    8814              : /* Handle small memcpy (up to SIZE that is supposed to be small power of 2)
    8815              :    and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
    8816              :    bytes and last SIZE bytes adjusting DESTPTR/SRCPTR/COUNT in a way we can
    8817              :    proceed with a loop copying SIZE bytes at once. Do moves in MODE.
    8818              :    DONE_LABEL is a label after the whole copying sequence. The label is created
    8819              :    on demand if *DONE_LABEL is NULL.
    8820              :    MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
    8821              :    bounds after the initial copies.
    8822              : 
    8823              :    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
    8824              :    DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
    8825              :    we will dispatch to a library call for large blocks.
    8826              : 
    8827              :    In pseudocode we do:
    8828              : 
    8829              :    if (COUNT < SIZE)
    8830              :      {
    8831              :        Assume that SIZE is 4. Bigger sizes are handled analogously
    8832              :        if (COUNT & 4)
    8833              :          {
    8834              :             copy 4 bytes from SRCPTR to DESTPTR
    8835              :             copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
    8836              :             goto done_label
    8837              :          }
    8838              :        if (!COUNT)
    8839              :          goto done_label;
    8840              :        copy 1 byte from SRCPTR to DESTPTR
    8841              :        if (COUNT & 2)
    8842              :          {
    8843              :             (the first byte is already copied above)
    8844              :             copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
    8845              :          }
    8846              :      }
    8847              :    else
    8848              :      {
    8849              :        copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
    8850              :        copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
    8851              : 
    8852              :        OLD_DESTPTR = DESTPTR;
    8853              :        Align DESTPTR up to DESIRED_ALIGN
    8854              :        SRCPTR += DESTPTR - OLD_DESTPTR
    8855              :        COUNT -= DESTPTR - OLD_DESTPTR
    8856              :        if (DYNAMIC_CHECK)
    8857              :          Round COUNT down to multiple of SIZE
    8858              :        << optional caller supplied zero size guard is here >>
    8859              :        << optional caller supplied dynamic check is here >>
    8860              :        << caller supplied main copy loop is here >>
    8861              :      }
    8862              :    done_label:
    8863              :   */
    8864              : static void
    8865        10546 : expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
    8866              :                                                             rtx *destptr, rtx *srcptr,
    8867              :                                                             machine_mode mode,
    8868              :                                                             rtx value, rtx vec_value,
    8869              :                                                             rtx *count,
    8870              :                                                             rtx_code_label **done_label,
    8871              :                                                             int size,
    8872              :                                                             int desired_align,
    8873              :                                                             int align,
    8874              :                                                             unsigned HOST_WIDE_INT *min_size,
    8875              :                                                             bool dynamic_check,
    8876              :                                                             bool issetmem)
    8877              : {
    8878        10546 :   rtx_code_label *loop_label = NULL, *label;
    8879        10546 :   int n;
    8880        10546 :   rtx modesize;
    8881        10546 :   int prolog_size = 0;  /* Bytes stored by the "first desired_align bytes" loop below.  */
    8882        10546 :   rtx mode_value;
    8883              : 
    8884              :   /* Choose proper value to copy.  */
    8885        10546 :   if (issetmem && VECTOR_MODE_P (mode))
    8886              :     mode_value = vec_value;
    8887              :   else
    8888        10546 :     mode_value = value;
    8889        21092 :   gcc_assert (GET_MODE_SIZE (mode) <= size);
    8890              : 
    8891              :   /* See if block is big or small, handle small blocks.  */
    8892        10546 :   if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    8893              :     {
    8894         7197 :       int size2 = size;
    8895         7197 :       loop_label = gen_label_rtx ();
    8896              : 
    8897         7197 :       if (!*done_label)
    8898         7197 :         *done_label = gen_label_rtx ();
    8899              : 
    8900         7197 :       emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
    8901              :                                1, loop_label);  /* COUNT >= SIZE: take the big-block path at LOOP_LABEL.  */
    8902         7197 :       size2 >>= 1;
    8903              : 
    8904              :       /* Handle sizes > 3.  */
    8905        35951 :       for (;size2 > 2; size2 >>= 1)  /* One handler per power of two from SIZE/2 down to 4.  */
    8906        28754 :         expand_small_cpymem_or_setmem (destmem, srcmem,
    8907              :                                        *destptr, *srcptr,
    8908              :                                        value, vec_value,
    8909              :                                        *count,
    8910              :                                        size2, *done_label, issetmem);
    8911              :       /* Nothing to copy?  Jump to DONE_LABEL if so */
    8912         7197 :       emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
    8913              :                                1, *done_label);
    8914              : 
    8915              :       /* Do a byte copy.  */
    8916         7197 :       destmem = change_address (destmem, QImode, *destptr);
    8917         7197 :       if (issetmem)
    8918          907 :         emit_move_insn (destmem, gen_lowpart (QImode, value));
    8919              :       else
    8920              :         {
    8921         6290 :           srcmem = change_address (srcmem, QImode, *srcptr);
    8922         6290 :           emit_move_insn (destmem, srcmem);
    8923              :         }
    8924              : 
    8925              :       /* Handle sizes 2 and 3.  */
    8926         7197 :       label = ix86_expand_aligntest (*count, 2, false);
    8927         7197 :       destmem = change_address (destmem, HImode, *destptr);
    8928         7197 :       destmem = offset_address (destmem, *count, 1);
    8929         7197 :       destmem = offset_address (destmem, GEN_INT (-2), 2);  /* DESTPTR + COUNT - 2: the last two bytes.  */
    8930         7197 :       if (issetmem)
    8931          907 :         emit_move_insn (destmem, gen_lowpart (HImode, value));
    8932              :       else
    8933              :         {
    8934         6290 :           srcmem = change_address (srcmem, HImode, *srcptr);
    8935         6290 :           srcmem = offset_address (srcmem, *count, 1);
    8936         6290 :           srcmem = offset_address (srcmem, GEN_INT (-2), 2);
    8937         6290 :           emit_move_insn (destmem, srcmem);
    8938              :         }
    8939              : 
    8940         7197 :       emit_label (label);
    8941         7197 :       LABEL_NUSES (label) = 1;
    8942         7197 :       emit_jump_insn (gen_jump (*done_label));
    8943         7197 :       emit_barrier ();
    8944              :     }
    8945              :   else
    8946         3349 :     gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
    8947              :                 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
    8948              : 
    8949              :   /* Start memcpy for COUNT >= SIZE.  */
    8950         7197 :   if (loop_label)
    8951              :     {
    8952         7197 :        emit_label (loop_label);
    8953         7197 :        LABEL_NUSES (loop_label) = 1;
    8954              :     }
    8955              : 
    8956              :   /* Copy first desired_align bytes.  */
    8957        10546 :   if (!issetmem)
    8958         7946 :     srcmem = change_address (srcmem, mode, *srcptr);
    8959        10546 :   destmem = change_address (destmem, mode, *destptr);
    8960        10546 :   modesize = GEN_INT (GET_MODE_SIZE (mode));
    8961        21113 :   for (n = 0; prolog_size < desired_align - align; n++)  /* N only counts iterations; PROLOG_SIZE drives the loop.  */
    8962              :     {
    8963           21 :       if (issetmem)
    8964            3 :         emit_move_insn (destmem, mode_value);
    8965              :       else
    8966              :         {
    8967           18 :           emit_move_insn (destmem, srcmem);
    8968           36 :           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
    8969              :         }
    8970           42 :       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    8971           42 :       prolog_size += GET_MODE_SIZE (mode);
    8972              :     }
    8973              : 
    8974              : 
    8975              :   /* Copy last SIZE bytes.  */
    8976        10546 :   destmem = offset_address (destmem, *count, 1);
    8977        10546 :   destmem = offset_address (destmem,
    8978        10546 :                             GEN_INT (-size - prolog_size),
    8979              :                             1);  /* DESTMEM was advanced by PROLOG_SIZE above, so this is DESTPTR + COUNT - SIZE.  */
    8980        10546 :   if (issetmem)
    8981         2600 :     emit_move_insn (destmem, mode_value);
    8982              :   else
    8983              :     {
    8984         7946 :       srcmem = offset_address (srcmem, *count, 1);
    8985         7946 :       srcmem = offset_address (srcmem,
    8986              :                                GEN_INT (-size - prolog_size),
    8987              :                                1);
    8988         7946 :       emit_move_insn (destmem, srcmem);
    8989              :     }
    8990        82628 :   for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)  /* Starts at 1: the first MODE-sized move was emitted just above.  */
    8991              :     {
    8992        30768 :       destmem = offset_address (destmem, modesize, 1);
    8993        30768 :       if (issetmem)
    8994         7599 :         emit_move_insn (destmem, mode_value);
    8995              :       else
    8996              :         {
    8997        23169 :           srcmem = offset_address (srcmem, modesize, 1);
    8998        23169 :           emit_move_insn (destmem, srcmem);
    8999              :         }
    9000              :     }
    9001              : 
    9002              :   /* Align destination.  */
    9003        10546 :   if (desired_align > 1 && desired_align > align)
    9004              :     {
    9005           21 :       rtx saveddest = *destptr;
    9006              : 
    9007           21 :       gcc_assert (desired_align <= size);
    9008              :       /* Align destptr up, place it to new register.  */
    9009           21 :       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
    9010              :                                       GEN_INT (prolog_size),
    9011              :                                       NULL_RTX, 1, OPTAB_DIRECT);
    9012           21 :       if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
    9013           21 :         REG_POINTER (*destptr) = 1;
    9014           21 :       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
    9015           21 :                                       GEN_INT (-desired_align),
    9016              :                                       *destptr, 1, OPTAB_DIRECT);
    9017              :       /* See how many bytes we skipped.  */
    9018           21 :       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
    9019              :                                        *destptr,
    9020              :                                        NULL_RTX, 1, OPTAB_DIRECT);  /* SAVEDDEST becomes old DESTPTR minus new DESTPTR.  */
    9021              :       /* Adjust srcptr and count.  */
    9022           21 :       if (!issetmem)
    9023           18 :         *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
    9024              :                                        saveddest, *srcptr, 1, OPTAB_DIRECT);  /* Subtracting (old - new) moves SRCPTR by new - old.  */
    9025           21 :       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
    9026              :                                     saveddest, *count, 1, OPTAB_DIRECT);  /* Adding (old - new) reduces COUNT by new - old.  */
    9027              :       /* We copied at most size + prolog_size.  */
    9028           21 :       if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
    9029           14 :         *min_size
    9030           14 :           = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);  /* NOTE(review): the test uses SIZE + PROLOG_SIZE but only SIZE is subtracted -- confirm intended.  */
    9031              :       else
    9032            7 :         *min_size = 0;
    9033              : 
    9034              :       /* Our loops always round down the block size, but for dispatch to
    9035              :          library we need precise value.  */
    9036           21 :       if (dynamic_check)
    9037           21 :         *count = expand_simple_binop (GET_MODE (*count), AND, *count,
    9038              :                                       GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    9039              :     }
    9040              :   else
    9041              :     {
    9042        10525 :       gcc_assert (prolog_size == 0);
    9043              :       /* Decrease count, so we won't end up copying last word twice.  */
    9044        10525 :       if (!CONST_INT_P (*count))
    9045         7197 :         *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
    9046              :                                       constm1_rtx, *count, 1, OPTAB_DIRECT);
    9047              :       else
    9048         3328 :         *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
    9049              :                                       (unsigned HOST_WIDE_INT)size));  /* Constant COUNT is also rounded down to a multiple of SIZE.  */
    9050        10525 :       if (*min_size)
    9051         9351 :         *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    9052              :     }
    9053        10546 : }
    9054              : 
    9055              : 
    9056              : /* This function is like expand_set_or_cpymem_prologue, except here we know how
    9057              :    many bytes (ALIGN_BYTES) need to be copied.  That allows us to update alignment
    9058              :    not only of DST, which is returned, but also of SRC, which is passed as a
    9059              :    pointer (SRCP) for that reason.  SRCP is only used when ISSETMEM is false.  */
    9060              : static rtx
    9061            0 : expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
    9062              :                                            rtx srcreg, rtx value, rtx vec_value,
    9063              :                                            int desired_align, int align_bytes,
    9064              :                                            bool issetmem)
    9065              : {
    9066            0 :   rtx src = NULL;
    9067            0 :   rtx orig_dst = dst;
    9068            0 :   rtx orig_src = NULL;
    9069            0 :   int piece_size = 1;
    9070            0 :   int copied_bytes = 0;
    9071              : 
    9072            0 :   if (!issetmem)
    9073              :     {
    9074            0 :       gcc_assert (srcp != NULL);
    9075            0 :       src = *srcp;
    9076            0 :       orig_src = src;
    9077              :     }
    9078              : 
    9079            0 :   for (piece_size = 1;
    9080            0 :        piece_size <= desired_align && copied_bytes < align_bytes;
    9081            0 :        piece_size <<= 1)
    9082              :     {
    9083            0 :       if (align_bytes & piece_size)  /* One piece per set bit of ALIGN_BYTES.  */
    9084              :         {
    9085            0 :           if (issetmem)
    9086              :             {
    9087            0 :               if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))  /* Piece wider than VALUE's mode: use VEC_VALUE.  */
    9088            0 :                 dst = emit_memset (dst, destreg, vec_value, piece_size);
    9089              :               else
    9090            0 :                 dst = emit_memset (dst, destreg, value, piece_size);
    9091              :             }
    9092              :           else
    9093            0 :             dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
    9094            0 :           copied_bytes += piece_size;
    9095              :         }
    9096              :     }
    9097            0 :   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)  /* Only ever raise the recorded alignment.  */
    9098            0 :     set_mem_align (dst, desired_align * BITS_PER_UNIT);
    9099            0 :   if (MEM_SIZE_KNOWN_P (orig_dst))
    9100            0 :     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);  /* The remaining block is ALIGN_BYTES shorter.  */
    9101              : 
    9102            0 :   if (!issetmem)
    9103              :     {
    9104            0 :       int src_align_bytes = get_mem_align_offset (src, desired_align
    9105              :                                                        * BITS_PER_UNIT);  /* NOTE(review): a negative result is treated as "unknown" below -- confirm against get_mem_align_offset.  */
    9106            0 :       if (src_align_bytes >= 0)
    9107            0 :         src_align_bytes = desired_align - src_align_bytes;
    9108            0 :       if (src_align_bytes >= 0)
    9109              :         {
    9110              :           unsigned int src_align;
    9111            0 :           for (src_align = desired_align; src_align >= 2; src_align >>= 1)  /* Largest SRC_ALIGN at which both offsets agree.  */
    9112              :             {
    9113            0 :               if ((src_align_bytes & (src_align - 1))
    9114            0 :                    == (align_bytes & (src_align - 1)))
    9115              :                 break;
    9116              :             }
    9117            0 :           if (src_align > (unsigned int) desired_align)
    9118              :             src_align = desired_align;
    9119            0 :           if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
    9120            0 :             set_mem_align (src, src_align * BITS_PER_UNIT);
    9121              :         }
    9122            0 :       if (MEM_SIZE_KNOWN_P (orig_src))
    9123            0 :         set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
    9124            0 :       *srcp = src;  /* Hand the updated SRC back to the caller.  */
    9125              :     }
    9126              : 
    9127            0 :   return dst;
    9128              : }
    9129              : 
    9130              : /* Return true if ALG can be used in the current context.
    9131              :    Assume we expand memset if MEMSET is true and memcpy otherwise.
                      :    DST_AS and SRC_AS are the address spaces of the destination and
                      :    of the source.  no_stringop is never usable.  */
    9132              : static bool
    9133       834735 : alg_usable_p (enum stringop_alg alg, bool memset,
    9134              :               addr_space_t dst_as, addr_space_t src_as)
    9135              : {
    9136       834735 :   if (alg == no_stringop)
    9137              :     return false;
    9138              :   /* It is not possible to use a library call if we have non-default
    9139              :      address space.  We can do better than the generic byte-at-a-time
    9140              :      loop, used as a fallback.  */
    9141       834735 :   if (alg == libcall &&
    9142       468505 :       !(ADDR_SPACE_GENERIC_P (dst_as) && ADDR_SPACE_GENERIC_P (src_as)))
    9143              :     return false;
    9144       834728 :   if (alg == vector_loop)
    9145       367949 :     return TARGET_SSE || TARGET_AVX;
    9146              :   /* Algorithms using the rep prefix want at least edi and ecx;
    9147              :      additionally, memset wants eax and memcpy wants esi.  Don't
    9148              :      consider such algorithms if the user has appropriated those
    9149              :      registers for their own purposes, or if we have the destination
    9150              :      in the non-default address space, since string insns cannot
    9151              :      override the destination segment.  A source in a non-default
                      :      address space is accepted only when Pmode == word_mode.  */
    9152       650722 :   if (alg == rep_prefix_1_byte
    9153              :       || alg == rep_prefix_4_byte
    9154       650722 :       || alg == rep_prefix_8_byte)
    9155              :     {
    9156        33572 :       if (fixed_regs[CX_REG]
    9157        33568 :           || fixed_regs[DI_REG]
    9158        33564 :           || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])
    9159        33560 :           || !ADDR_SPACE_GENERIC_P (dst_as)
    9160        67132 :           || !(ADDR_SPACE_GENERIC_P (src_as) || Pmode == word_mode))
    9161           12 :         return false;
    9162              :     }
    9163              :   return true;
    9164              : }
    9165              : 
    9166              : /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.
                      :    EXPECTED_SIZE is -1 when it is not known.  MIN_SIZE and MAX_SIZE
                      :    bound the block size.  MEMSET is true when expanding memset.
                      :    ZERO_MEMSET is consulted only when optimizing for size: a memset
                      :    for which it is false always gets a byte-sized variant.  DST_AS
                      :    and SRC_AS are the address spaces of the destination and source.
                      :    *DYNAMIC_CHECK is set to -1, or to a size threshold when
                      :    TARGET_INLINE_STRINGOPS_DYNAMICALLY applies.  *NOALIGN is set to
                      :    false, to true when optimizing for size, or to the noalign flag
                      :    of the selected cost table entry.  RECUR is true only for the
                      :    nested call this function makes on itself.  */
    9167              : static enum stringop_alg
    9168       165316 : decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
    9169              :             unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
    9170              :             bool memset, bool zero_memset, addr_space_t dst_as,
    9171              :             addr_space_t src_as, int *dynamic_check, bool *noalign, bool recur)
    9172              : {
    9173       165316 :   const struct stringop_algs *algs;
    9174       165316 :   bool optimize_for_speed;
    9175       165316 :   int max = 0;
    9176       165316 :   const struct processor_costs *cost;
    9177       165316 :   int i;
    9178       165316 :   bool any_alg_usable_p = false;
    9179              : 
    9180       165316 :   *noalign = false;
    9181       165316 :   *dynamic_check = -1;
    9182              : 
    9183              :   /* Even if the string operation call is cold, we still might spend a lot
    9184              :      of time processing large blocks.  */
    9185       165316 :   if (optimize_function_for_size_p (cfun)
    9186       165316 :       || (optimize_insn_for_size_p ()
    9187         9925 :           && (max_size < 256
    9188         3703 :               || (expected_size != -1 && expected_size < 256))))
    9189              :     optimize_for_speed = false;
    9190              :   else
    9191       148598 :     optimize_for_speed = true;
    9192              : 
    9193       148598 :   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
    9194       165316 :   if (memset)
    9195        49023 :     algs = &cost->memset[TARGET_64BIT != 0];
    9196              :   else
    9197       125193 :     algs = &cost->memcpy[TARGET_64BIT != 0];
    9198              : 
    9199              :   /* Record in MAX the size limit of the last usable non-libcall entry
                      :      of the cost table, and note whether any entry is usable at all.  */
    9200       826580 :   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    9201              :     {
    9202       661264 :       enum stringop_alg candidate = algs->size[i].alg;
    9203       661264 :       bool usable = alg_usable_p (candidate, memset, dst_as, src_as);
    9204       661264 :       any_alg_usable_p |= usable;
    9205              : 
    9206       661264 :       if (candidate != libcall && candidate && usable)
    9207       313847 :         max = algs->size[i].max;
    9208              :     }
    9209              : 
    9210              :   /* If expected size is not known but max size is small enough
    9211              :      so inline version is a win, set expected size into
    9212              :      the range.  */
    9213       165316 :   if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
    9214        35845 :       && expected_size == -1)
    9215        18415 :     expected_size = min_size / 2 + max_size / 2;
    9216              : 
    9217              :   /* If user specified the algorithm, honor it if possible.  */
    9218       165316 :   if (ix86_stringop_alg != no_stringop
    9219       165316 :       && alg_usable_p (ix86_stringop_alg, memset, dst_as, src_as))
    9220              :     return ix86_stringop_alg;
    9221              :   /* rep; movq or rep; movl is the smallest variant.  */
    9222       165204 :   else if (!optimize_for_speed)
    9223              :     {
    9224        16635 :       *noalign = true;
    9225        16635 :       if (!count || (count & 3) || (memset && !zero_memset))
    9226         5914 :         return alg_usable_p (rep_prefix_1_byte, memset, dst_as, src_as)
    9227         5914 :                ? rep_prefix_1_byte : loop_1_byte;
    9228              :       else
    9229        10721 :         return alg_usable_p (rep_prefix_4_byte, memset, dst_as, src_as)
    9230        10721 :                ? rep_prefix_4_byte : loop;
    9231              :     }
    9232              :   /* Very tiny blocks are best handled via the loop, REP is expensive to
    9233              :      setup.  */
    9234       148569 :   else if (expected_size != -1 && expected_size < 4)
    9235              :     return loop_1_byte;
    9236       145648 :   else if (expected_size != -1)
    9237              :     {
    9238              :       enum stringop_alg alg = libcall;
    9239              :       bool alg_noalign = false;
    9240       182143 :       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    9241              :         {
    9242              :           /* We get here if the algorithms that were not libcall-based
    9243              :              were rep-prefix based and we are unable to use rep prefixes
    9244              :              based on global register usage.  Break out of the loop and
    9245              :              use the heuristic below.  */
    9246       179210 :           if (algs->size[i].max == 0)
    9247              :             break;
    9248       179210 :           if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
    9249              :             {
    9250        75278 :               enum stringop_alg candidate = algs->size[i].alg;
    9251              : 
    9252        75278 :               if (candidate != libcall
    9253        75278 :                   && alg_usable_p (candidate, memset, dst_as, src_as))
    9254              :                 {
    9255        20356 :                   alg = candidate;
    9256        20356 :                   alg_noalign = algs->size[i].noalign;
    9257              :                 }
    9258              :               /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
    9259              :                  last non-libcall inline algorithm.  */
    9260        75278 :               if (TARGET_INLINE_ALL_STRINGOPS)
    9261              :                 {
    9262              :                   /* When the current size is best to be copied by a libcall,
    9263              :                      but we are still forced to inline, run the heuristic below
    9264              :                      that will pick code for medium sized blocks.  */
    9265        10982 :                   if (alg != libcall)
    9266              :                     {
    9267         5107 :                       *noalign = alg_noalign;
    9268         5107 :                       return alg;
    9269              :                     }
    9270         5875 :                   else if (!any_alg_usable_p)
    9271              :                     break;
    9272              :                 }
    9273        64296 :               else if (alg_usable_p (candidate, memset, dst_as, src_as)
    9274        64296 :                        && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
    9275           22 :                             && candidate == rep_prefix_1_byte
    9276              :                             /* NB: If min_size != max_size, size is
    9277              :                                unknown.  */
    9278           22 :                             && min_size != max_size))
    9279              :                 {
    9280        64277 :                   *noalign = algs->size[i].noalign;
    9281        64277 :                   return candidate;
    9282              :                 }
    9283              :             }
    9284              :         }
    9285              :     }
    9286              :   /* When asked to inline the call anyway, try to pick meaningful choice.
    9287              :      We look for maximal size of block that is faster to copy by hand and
    9288              :      take blocks of at most of that size guessing that average size will
    9289              :      be roughly half of the block.
    9290              : 
    9291              :      If this turns out to be bad, we might simply specify the preferred
    9292              :      choice in ix86_costs.  */
    9293        72055 :   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
    9294        76270 :       && (algs->unknown_size == libcall
    9295            0 :           || !alg_usable_p (algs->unknown_size, memset, dst_as, src_as)))
    9296              :     {
    9297         4215 :       enum stringop_alg alg;
    9298         4215 :       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
    9299              : 
    9300              :       /* If there aren't any usable algorithms or if recursing already,
    9301              :          then recursing on smaller sizes or same size isn't going to
    9302              :          find anything.  Just return the simple byte-at-a-time copy loop.  */
    9303         4215 :       if (!any_alg_usable_p || recur)
    9304              :         {
    9305              :           /* Pick something reasonable.  */
    9306            0 :           if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
    9307            0 :             *dynamic_check = 128;
    9308            0 :           return loop_1_byte;
    9309              :         }
    9310         4215 :       alg = decide_alg (count, new_expected_size, min_size, max_size,
    9311              :                         memset, zero_memset, dst_as, src_as,
    9312              :                         dynamic_check, noalign, true);
    9313         4215 :       gcc_assert (*dynamic_check == -1);
    9314         4215 :       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
    9315            8 :         *dynamic_check = max;
    9316              :       else
    9317         4207 :         gcc_assert (alg != libcall);
    9318         4215 :       return alg;
    9319              :     }
    9320              : 
    9321              :   /* Try to use some reasonable fallback algorithm.  Note that for
    9322              :      non-default address spaces we default to a loop instead of
    9323              :      a libcall.  */
    9324              : 
    9325        72049 :   bool have_as = !(ADDR_SPACE_GENERIC_P (dst_as)
    9326              :                    && ADDR_SPACE_GENERIC_P (src_as));
    9327              : 
    9328        72049 :   return (alg_usable_p (algs->unknown_size, memset, dst_as, src_as)
    9329        72049 :           ? algs->unknown_size : have_as ? loop : libcall);
    9330              : }
    9331              : 
    9332              : /* Decide on alignment.  We know that the operand is already aligned to ALIGN
    9333              :    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).
                      :    Return 0 when ALG is libcall or MOVE_MODE is VOIDmode; otherwise the
                      :    result is never smaller than ALIGN.  EXPECTED_SIZE is -1 when it is
                      :    not known.  */
    9334              : static int
    9335        34664 : decide_alignment (int align,
    9336              :                   enum stringop_alg alg,
    9337              :                   int expected_size,
    9338              :                   machine_mode move_mode)
    9339              : {
    9340        34664 :   int desired_align = 0;
    9341              : 
    9342        34664 :   gcc_assert (alg != no_stringop);
    9343              : 
    9344        34664 :   if (alg == libcall)
    9345              :     return 0;
    9346        34664 :   if (move_mode == VOIDmode)
    9347              :     return 0;
    9348              : 
    9349        34664 :   desired_align = GET_MODE_SIZE (move_mode);
    9350              :   /* PentiumPro has special logic triggering for 8 byte aligned blocks,
    9351              :      copying whole cacheline at once.  */
    9352        34664 :   if (TARGET_CPU_P (PENTIUMPRO)
    9353            0 :       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    9354        34664 :     desired_align = 8;
    9355              : 
    9356        34664 :   if (optimize_size)
    9357         9807 :     desired_align = 1;
    9358        34664 :   if (desired_align < align)
    9359              :     desired_align = align;
    9360        34664 :   if (expected_size != -1 && expected_size < 4)
    9361            0 :     desired_align = align;
    9362              : 
    9363              :   return desired_align;
    9364              : }
    9365              : 
    9366              : 
    9367              : /* Helper function for memset.  For QImode value 0xXY produce
    9368              :    0xXYXYXYXY of the width specified by MODE.  This is essentially
    9369              :    a * 0x01010101, but we can do slightly better than
    9370              :    synth_mult by unwinding the sequence by hand on CPUs with
    9371              :    slow multiply.
                      :    A VAL of const0_rtx yields a zeroed MODE register.  For an
                      :    integer vector MODE the scalar VAL is broadcast instead.
                      :    Otherwise MODE must be SImode or DImode.
                      :    NOTE(review): the MODE == QImode test further down follows the
                      :    assert that MODE is SImode or DImode, so it looks unreachable;
                      :    confirm before relying on it.  */
    9372              : static rtx
    9373        16550 : promote_duplicated_reg (machine_mode mode, rtx val)
    9374              : {
    9375        16550 :   if (val == const0_rtx)
    9376        14958 :     return copy_to_mode_reg (mode, CONST0_RTX (mode));
    9377              : 
    9378         1592 :   machine_mode valmode = GET_MODE (val);
    9379         1592 :   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    9380              :     {
    9381              :       /* Duplicate the scalar value for integer vector.  */
    9382         1271 :       gcc_assert ((val == const0_rtx || val == constm1_rtx)
    9383              :                   || GET_MODE_INNER (mode) == valmode);
    9384          647 :       rtx dup = gen_reg_rtx (mode);
    9385          647 :       bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
    9386              :                                                    val);
    9387          647 :       gcc_assert (ok);
    9388              :       return dup;
    9389              :     }
    9390              : 
    9391          945 :   rtx tmp;
    9392          945 :   int nops = mode == DImode ? 3 : 2;
    9393              : 
    9394           40 :   gcc_assert (mode == SImode || mode == DImode);
    9395          945 :   if (CONST_INT_P (val))
    9396              :     {
    9397          656 :       HOST_WIDE_INT v = INTVAL (val) & 255;
    9398              : 
    9399          656 :       v |= v << 8;
    9400          656 :       v |= v << 16;
    9401          656 :       if (mode == DImode)
    9402          628 :         v |= (v << 16) << 16;
    9403          656 :       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    9404              :     }
    9405              : 
    9406          289 :   if (valmode == VOIDmode)
    9407              :     valmode = QImode;
    9408          289 :   if (valmode != QImode)
    9409            0 :     val = gen_lowpart (QImode, val);
    9410          289 :   if (mode == QImode)
    9411              :     return val;
    9412          289 :   if (!TARGET_PARTIAL_REG_STALL)
    9413          289 :     nops--;
    9414          289 :   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
    9415          289 :       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
    9416          289 :       <= (ix86_cost->shift_const + ix86_cost->add) * nops
    9417          289 :           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    9418              :     {
    9419          289 :       rtx reg = convert_modes (mode, QImode, val, true);
    9420          289 :       tmp = promote_duplicated_reg (mode, const1_rtx);
    9421          289 :       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
    9422          289 :                                   OPTAB_DIRECT);
    9423              :     }
    9424              :   else
    9425              :     {
    9426            0 :       rtx reg = convert_modes (mode, QImode, val, true);
    9427              : 
    9428            0 :       if (!TARGET_PARTIAL_REG_STALL)
    9429            0 :         emit_insn (gen_insv_1 (mode, reg, reg));
    9430              :       else
    9431              :         {
    9432            0 :           tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
    9433              :                                      NULL, 1, OPTAB_DIRECT);
    9434            0 :           reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
    9435              :                                      OPTAB_DIRECT);
    9436              :         }
    9437            0 :       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
    9438              :                                  NULL, 1, OPTAB_DIRECT);
    9439            0 :       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
    9440            0 :       if (mode == SImode)
    9441              :         return reg;
    9442            0 :       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
    9443              :                                  NULL, 1, OPTAB_DIRECT);
    9444            0 :       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
    9445            0 :       return reg;
    9446              :     }
    9447              : }
    9448              : 
    9449              : /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
    9450              :    be needed by main loop copying SIZE_NEEDED chunks and prologue getting
    9451              :    alignment from ALIGN to DESIRED_ALIGN.  The widest mode requested is
                      :    DImode, and only when TARGET_64BIT; VAL itself is returned when no
                      :    widening is needed.
                      :    NOTE(review): promote_duplicated_reg asserts that a scalar mode is
                      :    SImode or DImode once VAL is not const0_rtx, so the HImode request
                      :    below looks like it would trip that assert; confirm it is dead.  */
    9452              : static rtx
    9453        12327 : promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
    9454              :                                 int align)
    9455              : {
    9456        12327 :   rtx promoted_val;
    9457              : 
    9458        12327 :   if (TARGET_64BIT
    9459        10850 :       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    9460         4379 :     promoted_val = promote_duplicated_reg (DImode, val);
    9461         7948 :   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    9462         6137 :     promoted_val = promote_duplicated_reg (SImode, val);
    9463         1811 :   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    9464            0 :     promoted_val = promote_duplicated_reg (HImode, val);
    9465              :   else
    9466              :     promoted_val = val;
    9467              : 
    9468        12327 :   return promoted_val;
    9469              : }
    9470              : 
    9471              : /* Copy the address to a Pmode register.  This is used for x32 to
    9472              :    truncate DImode TLS address to a SImode register.  When ADDR is
                      :    already in Pmode or has VOIDmode, the result is a fresh register
                      :    marked as a pointer.  Otherwise ADDR must be DImode with Pmode
                      :    being SImode, and the result is the SImode SUBREG at offset 0 of
                      :    a DImode register holding ADDR.  */
    9473              : 
    9474              : static rtx
    9475        67637 : ix86_copy_addr_to_reg (rtx addr)
    9476              : {
    9477        67637 :   rtx reg;
    9478        72212 :   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    9479              :     {
    9480        67637 :       reg = copy_addr_to_reg (addr);
    9481        67637 :       REG_POINTER (reg) = 1;
    9482        67637 :       return reg;
    9483              :     }
    9484              :   else
    9485              :     {
    9486            0 :       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
    9487            0 :       reg = copy_to_mode_reg (DImode, addr);
    9488            0 :       REG_POINTER (reg) = 1;
    9489            0 :       return gen_rtx_SUBREG (SImode, reg, 0);
    9490              :     }
    9491              : }
    9492              : 
    9493              : /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
    9494              :    operations when profitable.  The code depends upon architecture, block size
    9495              :    and alignment, but always has one of the following overall structures:
    9496              : 
    9497              :    Aligned move sequence:
    9498              : 
    9499              :      1) Prologue guard: Conditional that jumps up to epilogues for small
    9500              :         blocks that can be handled by epilogue alone.  This is faster
    9501              :         but also needed for correctness, since the prologue assumes the block
    9502              :         is larger than the desired alignment.
    9503              : 
    9504              :         Optional dynamic check for size and libcall for large
    9505              :         blocks is emitted here too, with -minline-stringops-dynamically.
    9506              : 
    9507              :      2) Prologue: copy first few bytes in order to get destination
    9508              :         aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
    9509              :         than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
    9510              :         copied.  We emit either a jump tree on power of two sized
    9511              :         blocks, or a byte loop.
    9512              : 
    9513              :      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
    9514              :         with specified algorithm.
    9515              : 
    9516              :      4) Epilogue: code copying tail of the block that is too small to be
    9517              :         handled by main body (or up to size guarded by prologue guard).
    9518              : 
    9519              :   Misaligned move sequence:
    9520              : 
    9521              :      1) Misaligned move prologue/epilogue containing:
    9522              :         a) Prologue handling small memory blocks and jumping to done_label
    9523              :            (skipped if blocks are known to be large enough)
    9524              :         b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
    9525              :            needed by single possibly misaligned move
    9526              :            (skipped if alignment is not needed)
    9527              :         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
    9528              : 
    9529              :      2) Zero size guard dispatching to done_label, if needed
    9530              : 
    9531              :      3) Dispatch to library call, if needed.
    9532              : 
    9533              :      4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
    9534              :         with specified algorithm.  */
    9535              : bool
    9536       147811 : ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
    9537              :                            rtx align_exp, rtx expected_align_exp,
    9538              :                            rtx expected_size_exp, rtx min_size_exp,
    9539              :                            rtx max_size_exp, rtx probable_max_size_exp,
    9540              :                            bool issetmem)
    9541              : {
    9542       147811 :   rtx destreg;
    9543       147811 :   rtx srcreg = NULL;
    9544       147811 :   rtx_code_label *label = NULL;
    9545       147811 :   rtx tmp;
    9546       147811 :   rtx_code_label *jump_around_label = NULL;
    9547       147811 :   HOST_WIDE_INT align = 1;
    9548       147811 :   unsigned HOST_WIDE_INT count = 0;
    9549       147811 :   HOST_WIDE_INT expected_size = -1;
    9550       147811 :   int size_needed = 0, epilogue_size_needed;
    9551       147811 :   int desired_align = 0, align_bytes = 0;
    9552       147811 :   enum stringop_alg alg;
    9553       147811 :   rtx promoted_val = NULL;
    9554       147811 :   rtx vec_promoted_val = NULL;
    9555       147811 :   bool force_loopy_epilogue = false;
    9556       147811 :   int dynamic_check;
    9557       147811 :   bool need_zero_guard = false;
    9558       147811 :   bool noalign;
    9559       147811 :   machine_mode move_mode = VOIDmode;
    9560       147811 :   int unroll_factor = 1;
    9561              :   /* TODO: Once value ranges are available, fill in proper data.  */
    9562       147811 :   unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
    9563       147811 :   unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
    9564       147811 :   unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
    9565       147811 :   bool misaligned_prologue_used = false;
    9566       147811 :   addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
    9567              : 
    9568       147811 :   if (CONST_INT_P (align_exp))
    9569       147811 :     align = INTVAL (align_exp);
    9570              :   /* i386 can do misaligned access on reasonably increased cost.  */
    9571       147811 :   if (CONST_INT_P (expected_align_exp)
    9572       147811 :       && INTVAL (expected_align_exp) > align)
    9573              :     align = INTVAL (expected_align_exp);
    9574              :   /* ALIGN is the minimum of destination and source alignment, but we care here
    9575              :      just about destination alignment.  */
    9576       140739 :   else if (!issetmem
    9577       237060 :            && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    9578         3269 :     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
    9579              : 
    9580       147811 :   if (CONST_INT_P (count_exp))
    9581              :     {
    9582        67525 :       min_size = max_size = probable_max_size = count = expected_size
    9583        67525 :         = INTVAL (count_exp);
    9584              :       /* When COUNT is 0, there is nothing to do.  */
    9585        67525 :       if (!count)
    9586              :         return true;
    9587              :     }
    9588              :   else
    9589              :     {
    9590        80286 :       if (min_size_exp)
    9591        80286 :         min_size = INTVAL (min_size_exp);
    9592        80286 :       if (max_size_exp)
    9593        70035 :         max_size = INTVAL (max_size_exp);
    9594        80286 :       if (probable_max_size_exp)
    9595        72071 :         probable_max_size = INTVAL (probable_max_size_exp);
    9596        80286 :       if (CONST_INT_P (expected_size_exp))
    9597        80286 :         expected_size = INTVAL (expected_size_exp);
    9598              :      }
    9599              : 
    9600              :   /* Make sure we don't need to care about overflow later on.  */
    9601       147809 :   if (count > (HOST_WIDE_INT_1U << 30))
    9602              :     return false;
    9603              : 
    9604       147635 :   dst_as = MEM_ADDR_SPACE (dst);
    9605       147635 :   if (!issetmem)
    9606       103279 :     src_as = MEM_ADDR_SPACE (src);
    9607              : 
    9608              :   /* Step 0: Decide on preferred algorithm, desired alignment and
    9609              :      size of chunks to be copied by main loop.  */
    9610       147635 :   alg = decide_alg (count, expected_size, min_size, probable_max_size,
    9611        44356 :                     issetmem, issetmem && val_exp == const0_rtx,
    9612              :                     dst_as, src_as, &dynamic_check, &noalign, false);
    9613              : 
    9614       147635 :   if (dump_file)
    9615            7 :     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
    9616            7 :              stringop_alg_names[alg]);
    9617              : 
    9618       147635 :   if (alg == libcall)
    9619              :     return false;
    9620        34664 :   gcc_assert (alg != no_stringop);
    9621              : 
    9622        34664 :   if (!count)
    9623        16271 :     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
    9624        34664 :   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
    9625        34664 :   if (!issetmem)
    9626        22337 :     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
    9627              : 
    9628        34664 :   bool aligned_dstmem = false;
    9629        34664 :   unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
    9630        34664 :   bool single_insn_p = count && count <= nunits;
    9631        34664 :   if (single_insn_p)
    9632              :     {
    9633              :       /* If it can be done with a single instruction, use vector
    9634              :          instruction and don't align destination.  */
    9635            6 :       alg = vector_loop;
    9636            6 :       noalign = true;
    9637            6 :       dynamic_check = -1;
    9638              :     }
    9639              : 
    9640        34664 :   unroll_factor = 1;
    9641        34664 :   move_mode = word_mode;
    9642        34664 :   switch (alg)
    9643              :     {
    9644            0 :     case libcall:
    9645            0 :     case no_stringop:
    9646            0 :     case last_alg:
    9647            0 :       gcc_unreachable ();
    9648         1703 :     case loop_1_byte:
    9649         1703 :       need_zero_guard = true;
    9650         1703 :       move_mode = QImode;
    9651         1703 :       break;
    9652           51 :     case loop:
    9653           51 :       need_zero_guard = true;
    9654           51 :       break;
    9655           20 :     case unrolled_loop:
    9656           20 :       need_zero_guard = true;
    9657           20 :       unroll_factor = (TARGET_64BIT ? 4 : 2);
    9658              :       break;
    9659        16230 :     case vector_loop:
    9660        16230 :       need_zero_guard = true;
    9661        16230 :       unroll_factor = 4;
    9662              :       /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes.  */
    9663        16230 :       nunits /= GET_MODE_SIZE (word_mode);
    9664        16230 :       if (nunits > 1)
    9665              :         {
    9666        16226 :           move_mode = mode_for_vector (word_mode, nunits).require ();
    9667        16226 :           gcc_assert (optab_handler (mov_optab, move_mode)
    9668              :                       != CODE_FOR_nothing);
    9669              :         }
    9670              :       break;
    9671           25 :     case rep_prefix_8_byte:
    9672           25 :       move_mode = DImode;
    9673           25 :       break;
    9674        10716 :     case rep_prefix_4_byte:
    9675        10716 :       move_mode = SImode;
    9676        10716 :       break;
    9677         5919 :     case rep_prefix_1_byte:
    9678         5919 :       move_mode = QImode;
    9679         5919 :       break;
    9680              :     }
    9681        34664 :   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
    9682        34664 :   epilogue_size_needed = size_needed;
    9683              : 
    9684              :   /* If we are going to call any library calls conditionally, make sure any
    9685              :      pending stack adjustment happen before the first conditional branch,
    9686              :      otherwise they will be emitted before the library call only and won't
    9687              :      happen from the other branches.  */
    9688        34664 :   if (dynamic_check != -1)
    9689            7 :     do_pending_stack_adjust ();
    9690              : 
    9691        34664 :   desired_align = decide_alignment (align, alg, expected_size, move_mode);
    9692        34664 :   if (!TARGET_ALIGN_STRINGOPS || noalign)
    9693        32874 :     align = desired_align;
    9694              : 
    9695              :   /* Step 1: Prologue guard.  */
    9696              : 
    9697              :   /* Alignment code needs count to be in register.  */
    9698        34664 :   if (CONST_INT_P (count_exp) && desired_align > align)
    9699              :     {
    9700           20 :       if (INTVAL (count_exp) > desired_align
    9701           20 :           && INTVAL (count_exp) > size_needed)
    9702              :         {
    9703           20 :           align_bytes
    9704           20 :             = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
    9705           20 :           if (align_bytes <= 0)
    9706              :             align_bytes = 0;
    9707              :           else
    9708            0 :             align_bytes = desired_align - align_bytes;
    9709              :         }
    9710            0 :       if (align_bytes == 0)
    9711           40 :         count_exp = force_reg (counter_mode (count_exp), count_exp);
    9712              :     }
    9713        34664 :   gcc_assert (desired_align >= 1 && align >= 1);
    9714              : 
    9715        34664 :   if (!single_insn_p)
    9716              :     {
    9717              :       /* Misaligned move sequences handle both prologue and epilogue
    9718              :          at once.  Default code generation results in a smaller code
    9719              :          for large alignments and also avoids redundant job when sizes
    9720              :          are known precisely.  */
    9721        34658 :       misaligned_prologue_used
    9722        69316 :         = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
    9723        34652 :            && MAX (desired_align, epilogue_size_needed) <= 32
    9724        18101 :            && desired_align <= epilogue_size_needed
    9725        40802 :            && ((desired_align > align && !align_bytes)
    9726         6123 :                || (!count && epilogue_size_needed > 1)));
    9727              : 
    9728              :       /* Destination is aligned after the misaligned prologue.  */
    9729        34658 :       aligned_dstmem = misaligned_prologue_used;
    9730              : 
    9731        34658 :       if (noalign && !misaligned_prologue_used)
    9732              :         {
    9733              :           /* Also use misaligned prologue if alignment isn't needed and
    9734              :              destination isn't aligned.   Since alignment isn't needed,
    9735              :              the destination after prologue won't be aligned.  */
    9736        32868 :           aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
    9737        32868 :                             <= MEM_ALIGN (dst));
    9738        32868 :           if (!aligned_dstmem)
    9739        10525 :             misaligned_prologue_used = true;
    9740              :         }
    9741              :     }
    9742              : 
    9743              :   /* Do the cheap promotion to allow better CSE across the
    9744              :      main loop and epilogue (i.e. one load of the big constant in
    9745              :      front of all the code).
    9746              :      For now the misaligned move sequences do not have a fast path
    9747              :      without broadcasting.  */
    9748        34664 :   if (issetmem
    9749        12327 :       && (alg == vector_loop
    9750         6582 :           || CONST_INT_P (val_exp)
    9751           48 :           || misaligned_prologue_used))
    9752              :     {
    9753         6534 :       if (alg == vector_loop)
    9754              :         {
    9755         5745 :           promoted_val = promote_duplicated_reg_to_size (val_exp,
    9756        11490 :                                                          GET_MODE_SIZE (word_mode),
    9757              :                                                          desired_align, align);
    9758              :           /* Duplicate the promoted scalar value if not 0 nor -1.  */
    9759         5745 :           vec_promoted_val
    9760         5745 :             = promote_duplicated_reg (move_mode,
    9761         5745 :                                       (val_exp == const0_rtx
    9762          647 :                                        || val_exp == constm1_rtx)
    9763              :                                       ? val_exp : promoted_val);
    9764              :         }
    9765              :       else
    9766              :         {
    9767         6534 :           promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
    9768              :                                                          desired_align, align);
    9769              :         }
    9770              :     }
    9771              :   /* Misaligned move sequences handle both prologues and epilogues at once.
    9772              :      Default code generation results in smaller code for large alignments and
    9773              :      also avoids redundant job when sizes are known precisely.  */
    9774        34616 :   if (misaligned_prologue_used)
    9775              :     {
    9776              :       /* Misaligned move prologue handled small blocks by itself.  */
    9777        10546 :       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
    9778        10546 :            (dst, src, &destreg, &srcreg,
    9779              :             move_mode, promoted_val, vec_promoted_val,
    9780              :             &count_exp,
    9781              :             &jump_around_label,
    9782        10546 :             desired_align < align
    9783            0 :             ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
    9784              :             desired_align, align, &min_size, dynamic_check, issetmem);
    9785        10546 :       if (!issetmem)
    9786         7946 :         src = change_address (src, BLKmode, srcreg);
    9787        10546 :       dst = change_address (dst, BLKmode, destreg);
    9788        10546 :       if (aligned_dstmem)
    9789           21 :         set_mem_align (dst, desired_align * BITS_PER_UNIT);
    9790        10546 :       epilogue_size_needed = 0;
    9791        10546 :       if (need_zero_guard
    9792        10268 :           && min_size < (unsigned HOST_WIDE_INT) size_needed)
    9793              :         {
    9794              :           /* It is possible that we copied enough so the main loop will not
    9795              :              execute.  */
    9796         7245 :           gcc_assert (size_needed > 1);
    9797         7245 :           if (jump_around_label == NULL_RTX)
    9798           50 :             jump_around_label = gen_label_rtx ();
    9799        14490 :           emit_cmp_and_jump_insns (count_exp,
    9800              :                                    GEN_INT (size_needed),
    9801              :                                    LTU, 0, counter_mode (count_exp), 1, jump_around_label);
    9802         7245 :           if (expected_size == -1
    9803           56 :               || expected_size < (desired_align - align) / 2 + size_needed)
    9804         7190 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9805              :           else
    9806           55 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9807              :         }
    9808              :     }
    9809              :   /* Ensure that alignment prologue won't copy past end of block.  */
    9810        24118 :   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    9811              :     {
    9812        16496 :       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
    9813              :       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
    9814              :          Make sure it is power of 2.  */
    9815        16496 :       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
    9816              : 
    9817              :       /* To improve performance of small blocks, we jump around the VAL
    9818              :          promoting mode.  This means that if the promoted VAL is not constant,
    9819              :          we might not use it in the epilogue and have to use the byte
    9820              :          loop variant.  */
    9821        16496 :       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
    9822        16496 :         force_loopy_epilogue = true;
    9823        16496 :       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9824        16488 :           || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9825              :         {
    9826              :           /* If main algorithm works on QImode, no epilogue is needed.
    9827              :              For small sizes just don't align anything.  */
    9828         2237 :           if (size_needed == 1)
    9829            0 :             desired_align = align;
    9830              :           else
    9831         2237 :             goto epilogue;
    9832              :         }
    9833        14259 :       else if (!count
    9834          256 :                && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9835              :         {
    9836          255 :           label = gen_label_rtx ();
    9837          510 :           emit_cmp_and_jump_insns (count_exp,
    9838              :                                    GEN_INT (epilogue_size_needed),
    9839              :                                    LTU, 0, counter_mode (count_exp), 1, label);
    9840          255 :           if (expected_size == -1 || expected_size < epilogue_size_needed)
    9841          255 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9842              :           else
    9843            0 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9844              :         }
    9845              :     }
    9846              : 
    9847              :   /* Emit code to decide on runtime whether library call or inline should be
    9848              :      used.  */
    9849        32427 :   if (dynamic_check != -1)
    9850              :     {
    9851            7 :       if (!issetmem && CONST_INT_P (count_exp))
    9852              :         {
    9853            1 :           if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
    9854              :             {
    9855            1 :               emit_block_copy_via_libcall (dst, src, count_exp);
    9856            1 :               count_exp = const0_rtx;
    9857            1 :               goto epilogue;
    9858              :             }
    9859              :         }
    9860              :       else
    9861              :         {
    9862            6 :           rtx_code_label *hot_label = gen_label_rtx ();
    9863            6 :           if (jump_around_label == NULL_RTX)
    9864            1 :             jump_around_label = gen_label_rtx ();
    9865           12 :           emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
    9866              :                                    LEU, 0, counter_mode (count_exp),
    9867              :                                    1, hot_label);
    9868            6 :           predict_jump (REG_BR_PROB_BASE * 90 / 100);
    9869            6 :           if (issetmem)
    9870            4 :             set_storage_via_libcall (dst, count_exp, val_exp);
    9871              :           else
    9872            2 :             emit_block_copy_via_libcall (dst, src, count_exp);
    9873            6 :           emit_jump (jump_around_label);
    9874            6 :           emit_label (hot_label);
    9875              :         }
    9876              :     }
    9877              : 
    9878              :   /* Step 2: Alignment prologue.  */
    9879              :   /* Do the expensive promotion once we branched off the small blocks.  */
    9880        32426 :   if (issetmem && !promoted_val)
    9881           48 :     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
    9882              :                                                    desired_align, align);
    9883              : 
    9884        32426 :   if (desired_align > align && !misaligned_prologue_used)
    9885              :     {
    9886            7 :       if (align_bytes == 0)
    9887              :         {
    9888              :           /* Except for the first move in the prologue, we no longer know
    9889              :              the constant offset in the aliasing info.  It doesn't seem worth
    9890              :              the pain to maintain it for the first move, so throw away
    9891              :              the info early.  */
    9892            7 :           dst = change_address (dst, BLKmode, destreg);
    9893            7 :           if (!issetmem)
    9894            5 :             src = change_address (src, BLKmode, srcreg);
    9895            7 :           dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
    9896              :                                             promoted_val, vec_promoted_val,
    9897              :                                             count_exp, align, desired_align,
    9898              :                                             issetmem);
    9899              :           /* At most desired_align - align bytes are copied.  */
    9900            7 :           if (min_size < (unsigned)(desired_align - align))
    9901            0 :             min_size = 0;
    9902              :           else
    9903            7 :             min_size -= desired_align - align;
    9904              :         }
    9905              :       else
    9906              :         {
    9907              :           /* If we know how many bytes need to be stored before dst is
    9908              :              sufficiently aligned, maintain aliasing info accurately.  */
    9909            0 :           dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
    9910              :                                                            srcreg,
    9911              :                                                            promoted_val,
    9912              :                                                            vec_promoted_val,
    9913              :                                                            desired_align,
    9914              :                                                            align_bytes,
    9915              :                                                            issetmem);
    9916              : 
    9917            0 :           count_exp = plus_constant (counter_mode (count_exp),
    9918            0 :                                      count_exp, -align_bytes);
    9919            0 :           count -= align_bytes;
    9920            0 :           min_size -= align_bytes;
    9921            0 :           max_size -= align_bytes;
    9922              :         }
    9923            7 :       if (need_zero_guard
    9924            7 :           && min_size < (unsigned HOST_WIDE_INT) size_needed
    9925            1 :           && (count < (unsigned HOST_WIDE_INT) size_needed
    9926            0 :               || (align_bytes == 0
    9927            0 :                   && count < ((unsigned HOST_WIDE_INT) size_needed
    9928            0 :                               + desired_align - align))))
    9929              :         {
    9930              :           /* It is possible that we copied enough so the main loop will not
    9931              :              execute.  */
    9932            1 :           gcc_assert (size_needed > 1);
    9933            1 :           if (label == NULL_RTX)
    9934            0 :             label = gen_label_rtx ();
    9935            2 :           emit_cmp_and_jump_insns (count_exp,
    9936              :                                    GEN_INT (size_needed),
    9937              :                                    LTU, 0, counter_mode (count_exp), 1, label);
    9938            1 :           if (expected_size == -1
    9939            0 :               || expected_size < (desired_align - align) / 2 + size_needed)
    9940            1 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9941              :           else
    9942            0 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9943              :         }
    9944              :     }
    9945        32426 :   if (label && size_needed == 1)
    9946              :     {
    9947            0 :       emit_label (label);
    9948            0 :       LABEL_NUSES (label) = 1;
    9949            0 :       label = NULL;
    9950            0 :       epilogue_size_needed = 1;
    9951            0 :       if (issetmem)
    9952            0 :         promoted_val = val_exp;
    9953              :     }
    9954        32426 :   else if (label == NULL_RTX && !misaligned_prologue_used)
    9955        21626 :     epilogue_size_needed = size_needed;
    9956              : 
    9957              :   /* Step 3: Main loop.  */
    9958              : 
    9959        32426 :   switch (alg)
    9960              :     {
    9961            0 :     case libcall:
    9962            0 :     case no_stringop:
    9963            0 :     case last_alg:
    9964            0 :       gcc_unreachable ();
    9965         1774 :     case loop_1_byte:
    9966         1774 :     case loop:
    9967         1774 :     case unrolled_loop:
    9968         1774 :       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
    9969              :                                      count_exp, move_mode, unroll_factor,
    9970              :                                      expected_size, issetmem);
    9971         1774 :       break;
    9972        13992 :     case vector_loop:
    9973        13992 :       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
    9974              :                                      vec_promoted_val, count_exp, move_mode,
    9975              :                                      unroll_factor, expected_size, issetmem);
    9976        13992 :       break;
    9977        16660 :     case rep_prefix_8_byte:
    9978        16660 :     case rep_prefix_4_byte:
    9979        16660 :     case rep_prefix_1_byte:
    9980        16660 :       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
    9981              :                                        val_exp, count_exp, move_mode, issetmem);
    9982        16660 :       break;
    9983              :     }
    9984              :   /* Adjust properly the offset of src and dest memory for aliasing.  */
    9985        32426 :   if (CONST_INT_P (count_exp))
    9986              :     {
    9987        18364 :       if (!issetmem)
    9988         8556 :         src = adjust_automodify_address_nv (src, BLKmode, srcreg,
    9989              :                                             (count / size_needed) * size_needed);
    9990        18364 :       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
    9991              :                                           (count / size_needed) * size_needed);
    9992              :     }
    9993              :   else
    9994              :     {
    9995        14062 :       if (!issetmem)
    9996        11819 :         src = change_address (src, BLKmode, srcreg);
    9997        14062 :       dst = change_address (dst, BLKmode, destreg);
    9998              :     }
    9999              : 
   10000              :   /* Step 4: Epilogue to copy the remaining bytes.  */
   10001        34664 :  epilogue:
   10002        34664 :   if (label)
   10003              :     {
   10004              :       /* When the main loop is done, COUNT_EXP might hold original count,
   10005              :          while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
   10006              :          Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
   10007              :          bytes. Compensate if needed.  */
   10008              : 
   10009          255 :       if (size_needed < epilogue_size_needed)
   10010              :         {
   10011            0 :           tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
   10012            0 :                                      GEN_INT (size_needed - 1), count_exp, 1,
   10013              :                                      OPTAB_DIRECT);
   10014            0 :           if (tmp != count_exp)
   10015            0 :             emit_move_insn (count_exp, tmp);
   10016              :         }
   10017          255 :       emit_label (label);
   10018          255 :       LABEL_NUSES (label) = 1;
   10019              :     }
   10020              : 
   10021        34664 :   if (count_exp != const0_rtx && epilogue_size_needed > 1)
   10022              :     {
   10023        16496 :       if (force_loopy_epilogue)
   10024            0 :         expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
   10025              :                                          epilogue_size_needed);
   10026              :       else
   10027              :         {
   10028        16496 :           if (issetmem)
   10029         7916 :             expand_setmem_epilogue (dst, destreg, promoted_val,
   10030              :                                     vec_promoted_val, count_exp,
   10031              :                                     epilogue_size_needed);
   10032              :           else
   10033         8580 :             expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
   10034              :                                     epilogue_size_needed);
   10035              :         }
   10036              :     }
   10037        34664 :   if (jump_around_label)
   10038         7248 :     emit_label (jump_around_label);
   10039              :   return true;
   10040              : }
   10041              : 
   10042              : /* Fully unroll memmove of known size with up to 8 registers.
                      :
                      :    DST and SRC are the memory operands; they are re-addressed
                      :    through DESTREG and SRCREG with change_address.  COUNT is the
                      :    number of bytes to copy and MODE the mode used for each
                      :    full-size move.  Every load is emitted before the first store.
                      :    Return false, without emitting anything, when more than 8 moves
                      :    of MODE would be needed; otherwise emit the copy and return
                      :    true.  */
   10043              : 
   10044              : static bool
   10045         1873 : ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
   10046              :                            unsigned HOST_WIDE_INT count,
   10047              :                            machine_mode mode)
   10048              : {
   10049              :   /* If 8 registers can cover all memory, load them into
   10050              :      registers and store them together to avoid possible address
   10051              :      overlap between source and destination.  */
   10052         1873 :   unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
   10053         1873 :   if (moves == 0)
   10054              :     {
                      :       /* COUNT is smaller than MODE.  Switch to the smallest integer
                      :          mode that holds COUNT bytes; if that mode is exactly COUNT
                      :          bytes wide a single move covers the block.  */
   10055            0 :       mode = smallest_int_mode_for_size
   10056            0 :         (count * BITS_PER_UNIT).require ();
   10057            0 :       if (count == GET_MODE_SIZE (mode))
   10058              :         moves = 1;
   10059              :       else
   10060              :         {
   10061              :           /* Reduce the smallest move size by half so that MOVES == 1.  */
   10062            0 :           mode = smallest_int_mode_for_size
   10063            0 :             (GET_MODE_BITSIZE (mode) / 2).require ();
   10064            0 :           moves = count / GET_MODE_SIZE (mode);
   10065            0 :           gcc_assert (moves == 1);
   10066              :         }
   10067              :     }
   10068         1873 :   else if (moves > 8)
   10069              :     return false;
   10070              : 
   10071         1864 :   unsigned int i;
                      :   /* Entries 0-7 hold the full-size moves (MOVES <= 8 here); entry 8
                      :      is reserved for the tail move.  */
   10072         1864 :   rtx tmp[9];
   10073              : 
   10074         4296 :   for (i = 0; i < moves; i++)
   10075         2432 :     tmp[i] = gen_reg_rtx (mode);
   10076              : 
                      :   /* Load the MOVES full-size pieces, stepping the source address by
                      :      the size of MODE after each load.  */
   10077         1864 :   rtx srcmem = change_address (src, mode, srcreg);
   10078         6160 :   for (i = 0; i < moves; i++)
   10079              :     {
   10080         2432 :       emit_move_insn (tmp[i], srcmem);
   10081         4864 :       srcmem = offset_address (srcmem,
   10082         2432 :                                GEN_INT (GET_MODE_SIZE (mode)),
   10083         2432 :                                GET_MODE_SIZE (mode));
   10084              :     }
   10085              : 
                      :   /* COUNT masked with the size of MODE minus one, i.e. the bytes
                      :      left over after the full-size moves when that size is a power
                      :      of two.  */
   10086         1864 :   unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
   10087         1864 :   machine_mode epilogue_mode = VOIDmode;
   10088         1864 :   if (epilogue_size)
   10089              :     {
   10090              :       /* Handle the remaining bytes with overlapping move.  */
   10091         1691 :       epilogue_mode = smallest_int_mode_for_size
   10092         1691 :         (epilogue_size * BITS_PER_UNIT).require ();
   10093         1691 :       tmp[8] = gen_reg_rtx (epilogue_mode);
                      :       /* Offset forward by EPILOGUE_SIZE and then back by the size of
                      :          EPILOGUE_MODE, so the access presumably ends on the last
                      :          byte of the block and may overlap bytes already loaded.  */
   10094         1691 :       srcmem = adjust_address (srcmem, epilogue_mode, 0);
   10095         1691 :       srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
   10096         3382 :       srcmem = offset_address (srcmem,
   10097         1691 :                                GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
   10098         1691 :                                GET_MODE_SIZE (epilogue_mode));
   10099         1691 :       emit_move_insn (tmp[8], srcmem);
   10100              :     }
   10101              : 
                      :   /* All loads are done; now store the pieces in the same order.  */
   10102         1864 :   rtx destmem = change_address (dst, mode, destreg);
   10103         6160 :   for (i = 0; i < moves; i++)
   10104              :     {
   10105         2432 :       emit_move_insn (destmem, tmp[i]);
   10106         4864 :       destmem = offset_address (destmem,
   10107         2432 :                                 GEN_INT (GET_MODE_SIZE (mode)),
   10108         2432 :                                 GET_MODE_SIZE (mode));
   10109              :     }
   10110              : 
   10111         1864 :   if (epilogue_size)
   10112              :     {
   10113              :       /* Use overlapping move.  */
                      :       /* Same address computation as for the tail load above, applied
                      :          to the destination.  */
   10114         1691 :       destmem = adjust_address (destmem, epilogue_mode, 0);
   10115         1691 :       destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
   10116         3382 :       destmem = offset_address (destmem,
   10117         1691 :                                 GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
   10118         1691 :                                 GET_MODE_SIZE (epilogue_mode));
   10119         1691 :       emit_move_insn (destmem, tmp[8]);
   10120              :     }
   10121              : 
   10122              :   return true;
   10123              : }
   10124              : 
   10125              : /* Expand memmove of size with MOVES * mode size and MOVES <= 4.  If
   10126              :    FORWARD is true, copy forward.  Otherwise copy backward.
                      :
                      :    Starting at SRCMEM, MOVES values of MODE are loaded into newly
                      :    generated registers; only after all loads have been emitted are
                      :    those registers stored, starting at DESTMEM.  Between
                      :    consecutive accesses the address is offset by the size of MODE
                      :    when FORWARD is true and by minus that size otherwise.
                      :
                      :    NOTE(review): the loops below run to MOVES - 1 in unsigned
                      :    arithmetic, so MOVES == 0 would wrap around; the assert only
                      :    checks the upper bound.  Presumably callers always pass
                      :    MOVES >= 1 -- confirm.  */
   10127              : 
   10128              : static void
   10129         2298 : ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
   10130              :                            unsigned int moves, bool forward)
   10131              : {
   10132         2298 :   gcc_assert (moves <= 4);
   10133              : 
   10134              :   unsigned int i;
                      :   /* Only the first MOVES entries are used.  */
   10135              :   rtx tmp[8];
   10136              : 
   10137        11490 :   for (i = 0; i < moves; i++)
   10138         9192 :     tmp[i] = gen_reg_rtx (mode);
   10139              : 
                      :   /* Signed per-access address increment.  */
   10140         2298 :   rtx step;
   10141         2298 :   if (forward)
   10142         2298 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10143              :   else
   10144         2298 :     step = GEN_INT (-GET_MODE_SIZE (mode));
   10145              : 
   10146              :   /* Load MOVES.  */
                      :   /* The last load is emitted after the loop so that the address is
                      :      not stepped once more past the final access.  */
   10147         9192 :   for (i = 0; i < moves - 1; i++)
   10148              :     {
   10149         6894 :       emit_move_insn (tmp[i], srcmem);
   10150        13788 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10151              :     }
   10152         2298 :   emit_move_insn (tmp[i], srcmem);
   10153              : 
   10154              :   /* Store MOVES.  */
   10155        11490 :   for (i = 0; i < moves - 1; i++)
   10156              :     {
   10157         6894 :       emit_move_insn (destmem, tmp[i]);
   10158        13788 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10159              :     }
   10160         2298 :   emit_move_insn (destmem, tmp[i]);
   10161         2298 : }
   10162              : 
   10163              : /* Load MOVES of mode size into REGS.  If LAST is true, load the
   10164              :    last MOVES.  Otherwise, load the first MOVES.
                      :
                      :    SRC is re-addressed through SRCREG in MODE.  This function
                      :    generates the registers itself and writes them to
                      :    REGS[0 .. MOVES - 1].  When LAST is false, loads start at SRCREG
                      :    and each following load is offset by the size of MODE.  When
                      :    LAST is true, the first load is addressed at SRCREG offset by
                      :    COUNT_EXP and then by minus the size of MODE, and each following
                      :    load is offset by a further minus that size, so REGS[0] receives
                      :    the piece at the highest address.
                      :
                      :    NOTE(review): the load loop runs to MOVES - 1 in unsigned
                      :    arithmetic, so MOVES == 0 would wrap around.  Presumably callers
                      :    always pass MOVES >= 1 -- confirm.  */
   10165              : 
   10166              : static void
   10167         2298 : ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
   10168              :                          machine_mode mode, unsigned int moves,
   10169              :                          rtx regs[], bool last)
   10170              : {
   10171         2298 :   unsigned int i;
   10172              : 
   10173        11490 :   for (i = 0; i < moves; i++)
   10174         9192 :     regs[i] = gen_reg_rtx (mode);
   10175              : 
   10176         2298 :   rtx srcmem = change_address (src, mode, srcreg);
   10177         2298 :   rtx step;
   10178         2298 :   if (last)
   10179              :     {
                      :       /* Offset by COUNT_EXP, then step back one MODE-sized piece to
                      :          reach the first access.  */
   10180         1149 :       srcmem = offset_address (srcmem, count_exp, 1);
   10181         2298 :       step = GEN_INT (-GET_MODE_SIZE (mode));
   10182         2298 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10183              :     }
   10184              :   else
   10185         2298 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10186              : 
                      :   /* The final load is emitted after the loop so the address is not
                      :      stepped beyond the last access.  */
   10187         9192 :   for (i = 0; i < moves - 1; i++)
   10188              :     {
   10189         6894 :       emit_move_insn (regs[i], srcmem);
   10190        13788 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10191              :     }
   10192         2298 :   emit_move_insn (regs[i], srcmem);
   10193         2298 : }
   10194              : 
   10195              : /* Store MOVES chunks of mode size from REGS into DST.  The address
                      :    used for DST is DESTREG.  If LAST is true, store the last MOVES:
                      :    start at the chunk that ends at DESTREG + COUNT_EXP and step down
                      :    by the mode size, so REGS[0] goes to the highest chunk.
   10196              :    Otherwise, store the first MOVES, stepping up from DESTREG.  MOVES
                      :    must be at least 1: the final store is emitted unconditionally.  */
   10197              : 
   10198              : static void
   10199         2298 : ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
   10200              :                           machine_mode mode, unsigned int moves,
   10201              :                           rtx regs[], bool last)
   10202              : {
   10203         2298 :   unsigned int i;
   10204              : 
   10205         2298 :   rtx destmem = change_address (dst, mode, destreg);
   10206         2298 :   rtx step;
   10207         2298 :   if (last)
   10208              :     {
   10209         1149 :       destmem = offset_address (destmem, count_exp, 1);  /* DESTREG + COUNT_EXP.  */
   10210         2298 :       step = GEN_INT (-GET_MODE_SIZE (mode));
   10211         2298 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));  /* Last chunk.  */
   10212              :     }
   10213              :   else
   10214         2298 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10215              : 
   10216         9192 :   for (i = 0; i < moves - 1; i++)
   10217              :     {
   10218         6894 :       emit_move_insn (destmem, regs[i]);
   10219        13788 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10220              :     }
   10221         2298 :   emit_move_insn (destmem, regs[i]);  /* Final store; the address is not advanced.  */
   10222         2298 : }
   10223              : 
   10224              : /* Expand memmove of size between (MOVES / 2) * mode size and
   10225              :    MOVES * mode size with overlapping load and store.  MOVES is even.
   10226              :    MOVES >= 2 and MOVES <= 8.  The first MOVES / 2 chunks are copied
                      :    forward from the start address (SRCREG for SRC, DESTREG for DST)
                      :    and the other MOVES / 2 chunks backward from the end address
                      :    (start + COUNT_EXP), so the two halves may overlap in the middle.
                      :    All loads are emitted before any store.  */
   10227              : 
   10228              : static void
   10229        12538 : ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
   10230              :                                        rtx srcreg, rtx count_exp,
   10231              :                                        machine_mode mode,
   10232              :                                        unsigned int moves)
   10233              : {
   10234        12538 :   gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);
   10235              : 
   10236        12538 :   unsigned int half_moves = moves / 2;
   10237        12538 :   unsigned int i, j;
   10238        12538 :   rtx tmp[8];  /* Large enough: MOVES <= 8 is asserted above.  */
   10239              : 
   10240        47890 :   for (i = 0; i < moves; i++)
   10241        35352 :     tmp[i] = gen_reg_rtx (mode);
   10242              : 
   10243        12538 :   rtx base_srcmem = change_address (src, mode, srcreg);
   10244              : 
   10245              :   /* Load the first half into TMP[0 .. HALF_MOVES - 1], forward from SRCREG.  */
   10246        12538 :   rtx srcmem = base_srcmem;
   10247        30214 :   for (i = 0; i < half_moves - 1; i++)
   10248              :     {
   10249         5138 :       emit_move_insn (tmp[i], srcmem);
   10250        10276 :       srcmem = offset_address (srcmem,
   10251         5138 :                                GEN_INT (GET_MODE_SIZE (mode)),
   10252         5138 :                                GET_MODE_SIZE (mode));
   10253              :     }
   10254        12538 :   emit_move_insn (tmp[i], srcmem);
   10255              : 
   10256              :   /* Load the second half into TMP[HALF_MOVES ..], backward from SRCREG + COUNT_EXP.  */
   10257        12538 :   srcmem = offset_address (base_srcmem, count_exp, 1);
   10258        12538 :   srcmem = offset_address (srcmem,
   10259        12538 :                            GEN_INT (-GET_MODE_SIZE (mode)),
   10260        12538 :                            GET_MODE_SIZE (mode));
   10261        30214 :   for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
   10262              :     {
   10263         5138 :       emit_move_insn (tmp[j], srcmem);
   10264        10276 :       srcmem = offset_address (srcmem,
   10265         5138 :                                GEN_INT (-GET_MODE_SIZE (mode)),
   10266         5138 :                                GET_MODE_SIZE (mode));
   10267              :     }
   10268        12538 :   emit_move_insn (tmp[j], srcmem);
   10269              : 
   10270        12538 :   rtx base_destmem = change_address (dst, mode, destreg);
   10271              : 
   10272              :   /* Store the first half from TMP[0 .. HALF_MOVES - 1], forward from DESTREG.  */
   10273        12538 :   rtx destmem = base_destmem;
   10274        30214 :   for (i = 0; i < half_moves - 1; i++)
   10275              :     {
   10276         5138 :       emit_move_insn (destmem, tmp[i]);
   10277        10276 :       destmem = offset_address (destmem,
   10278         5138 :                                 GEN_INT (GET_MODE_SIZE (mode)),
   10279         5138 :                                 GET_MODE_SIZE (mode));
   10280              :     }
   10281        12538 :   emit_move_insn (destmem, tmp[i]);
   10282              : 
   10283              :   /* Store the second half from TMP[HALF_MOVES ..], backward from DESTREG + COUNT_EXP.  */
   10284        12538 :   destmem = offset_address (base_destmem, count_exp, 1);
   10285        25076 :   destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
   10286        12538 :                             GET_MODE_SIZE (mode));
   10287        30214 :   for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
   10288              :     {
   10289         5138 :       emit_move_insn (destmem, tmp[j]);
   10290        10276 :       destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
   10291         5138 :                                 GET_MODE_SIZE (mode));
   10292              :     }
   10293        12538 :   emit_move_insn (destmem, tmp[j]);
   10294        12538 : }
   10295              : 
   10296              : /* Expand memmove of size < mode size which is <= 64.  Emit a chain of
                      :    tests on COUNT_EXP against 32, 16, 8, 4 and 2, using only the
                      :    thresholds that are below the size of MODE.  Each test dispatches
                      :    to a two-move overlapping copy in OImode, TImode, DImode, SImode
                      :    or HImode respectively.  When MIN_SIZE already reaches a
                      :    threshold, the jump is unconditional and every smaller case,
                      :    including the 0 or 1 byte fallback, is omitted.  Every emitted
                      :    path ends with a jump to DONE_LABEL.  */
   10297              : 
   10298              : static void
   10299         2814 : ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
   10300              :                               rtx srcreg, rtx count_exp,
   10301              :                               unsigned HOST_WIDE_INT min_size,
   10302              :                               machine_mode mode,
   10303              :                               rtx_code_label *done_label)
   10304              : {
   10305         2814 :   bool skip = false;
   10306         2814 :   machine_mode count_mode = counter_mode (count_exp);
   10307              : 
   10308         2814 :   rtx_code_label *between_32_63_label
   10309         2814 :     = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
   10310              :   /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size <= 63.  */
   10311            3 :   if (between_32_63_label)
   10312              :     {
   10313            3 :       if (min_size && min_size >= 32)
   10314              :         {
   10315            1 :           emit_jump_insn (gen_jump (between_32_63_label));
   10316            1 :           emit_barrier ();
   10317            1 :           skip = true;  /* No smaller size needs handling.  */
   10318              :         }
   10319              :       else
   10320            2 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
   10321              :                                  nullptr, count_mode, 1,
   10322              :                                  between_32_63_label);
   10323              :     }
   10324              : 
   10325            3 :   rtx_code_label *between_16_31_label
   10326         2813 :     = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
   10327              :   /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size <= 31.  */
   10328            4 :   if (between_16_31_label)
   10329              :     {
   10330            4 :       if (min_size && min_size >= 16)
   10331              :         {
   10332            2 :           emit_jump_insn (gen_jump (between_16_31_label));
   10333            2 :           emit_barrier ();
   10334            2 :           skip = true;
   10335              :         }
   10336              :       else
   10337            2 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
   10338              :                                  nullptr, count_mode, 1,
   10339              :                                  between_16_31_label);
   10340              :     }
   10341              : 
   10342            2 :   rtx_code_label *between_8_15_label
   10343         5623 :     = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
   10344              :   /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size <= 15.  */
   10345         1895 :   if (between_8_15_label)
   10346              :     {
   10347         1895 :       if (min_size && min_size >= 8)
   10348              :         {
   10349          147 :           emit_jump_insn (gen_jump (between_8_15_label));
   10350          147 :           emit_barrier ();
   10351          147 :           skip = true;
   10352              :         }
   10353              :       else
   10354         1748 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
   10355              :                                  nullptr, count_mode, 1,
   10356              :                                  between_8_15_label);
   10357              :     }
   10358              : 
   10359          147 :   rtx_code_label *between_4_7_label
   10360         5331 :     = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
   10361              :   /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size <= 7.  */
   10362         2131 :   if (between_4_7_label)
   10363              :     {
   10364         2131 :       if (min_size && min_size >= 4)
   10365              :         {
   10366          152 :           emit_jump_insn (gen_jump (between_4_7_label));
   10367          152 :           emit_barrier ();
   10368          152 :           skip = true;
   10369              :         }
   10370              :       else
   10371         1979 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
   10372              :                                  nullptr, count_mode, 1,
   10373              :                                  between_4_7_label);
   10374              :     }
   10375              : 
   10376          152 :   rtx_code_label *between_2_3_label
   10377         5174 :     = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
   10378              :   /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size <= 3.  */
   10379         2366 :   if (between_2_3_label)
   10380              :     {
   10381         2366 :       if (min_size && min_size >= 2)
   10382              :         {
   10383          128 :           emit_jump_insn (gen_jump (between_2_3_label));
   10384          128 :           emit_barrier ();
   10385          128 :           skip = true;
   10386              :         }
   10387              :       else
   10388         2238 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
   10389              :                                  nullptr, count_mode, 1,
   10390              :                                  between_2_3_label);
   10391              :     }
   10392              : 
   10393         2814 :   if (!skip)
   10394              :     {
   10395         2384 :       rtx_code_label *zero_label
   10396         2384 :         = min_size == 0 ? gen_label_rtx () : nullptr;
   10397              :       /* Skip the 1-byte move if size == 0.  Only tested when MIN_SIZE is 0.  */
   10398         1556 :       if (zero_label)
   10399         1556 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
   10400              :                                  nullptr, count_mode, 1,
   10401              :                                  zero_label,
   10402              :                                  profile_probability::unlikely ());
   10403              : 
   10404              :       /* Move 1 byte.  */
   10405         2384 :       rtx tmp0 = gen_reg_rtx (QImode);
   10406         2384 :       rtx srcmem = change_address (src, QImode, srcreg);
   10407         2384 :       emit_move_insn (tmp0, srcmem);
   10408         2384 :       rtx destmem = change_address (dst, QImode, destreg);
   10409         2384 :       emit_move_insn (destmem, tmp0);
   10410              : 
   10411         2384 :       if (zero_label)
   10412         1556 :         emit_label (zero_label);  /* Size 0 lands here and falls into the jump below.  */
   10413              : 
   10414         2384 :       emit_jump_insn (gen_jump (done_label));
   10415         2384 :       emit_barrier ();
   10416              :     }
   10417              : 
   10418         2814 :   if (between_32_63_label)
   10419              :     {
   10420            3 :       emit_label (between_32_63_label);
   10421            3 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10422              :                                              count_exp, OImode, 2);
   10423            3 :       emit_jump_insn (gen_jump (done_label));
   10424            3 :       emit_barrier ();
   10425              :     }
   10426              : 
   10427         2814 :   if (between_16_31_label)
   10428              :     {
   10429            4 :       emit_label (between_16_31_label);
   10430            4 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10431              :                                              count_exp, TImode, 2);
   10432            4 :       emit_jump_insn (gen_jump (done_label));
   10433            4 :       emit_barrier ();
   10434              :     }
   10435              : 
   10436         2814 :   if (between_8_15_label)
   10437              :     {
   10438         1895 :       emit_label (between_8_15_label);
   10439         1895 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10440              :                                              count_exp, DImode, 2);
   10441         1895 :       emit_jump_insn (gen_jump (done_label));
   10442         1895 :       emit_barrier ();
   10443              :     }
   10444              : 
   10445         2814 :   if (between_4_7_label)
   10446              :     {
   10447         2131 :       emit_label (between_4_7_label);
   10448         2131 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10449              :                                              count_exp, SImode, 2);
   10450         2131 :       emit_jump_insn (gen_jump (done_label));
   10451         2131 :       emit_barrier ();
   10452              :     }
   10453              : 
   10454         2814 :   if (between_2_3_label)
   10455              :     {
   10456         2366 :       emit_label (between_2_3_label);
   10457         2366 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10458              :                                              count_exp, HImode, 2);
   10459         2366 :       emit_jump_insn (gen_jump (done_label));
   10460         2366 :       emit_barrier ();
   10461              :     }
   10462         2814 : }
   10463              : 
   10464              : /* Expand movmem with overlapping unaligned loads and stores:
   10465              :    1. Load all sources into registers and store them together to avoid
   10466              :       possible address overlap between source and destination.
   10467              :    2. For known size, first try to fully unroll with 8 registers.
   10468              :    3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
   10469              :       and then store them together.
   10470              :    4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources
   10471              :       into 4 registers first and then store them together.
   10472              :    5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources
   10473              :       into 8 registers first and then store them together.
   10474              :    6. For size > 8 * MOVE_MAX,
   10475              :       a. If address of destination > address of source, copy backward
   10476              :          with a 4 * MOVE_MAX loop with unaligned loads and stores.  Load
   10477              :          the first 4 * MOVE_MAX into 4 registers before the loop and
   10478              :          store them after the loop to support overlapping addresses.
   10479              :       b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned
   10480              :          loads and stores.  Load the last 4 * MOVE_MAX into 4 registers
   10481              :          before the loop and store them after the loop to support
   10482              :          overlapping addresses.
   10483              :  */
   10484              : 
   10485              : bool
   10486        17325 : ix86_expand_movmem (rtx operands[])
   10487              : {
   10488              :   /* Since there are far fewer registers available in 32-bit mode, don't
   10489              :      inline movmem in 32-bit mode.  */
   10490        17325 :   if (!TARGET_64BIT || optimize_insn_for_size_p ())
   10491         3828 :     return false;
   10492              : 
   10493        13497 :   rtx dst = operands[0];
   10494        13497 :   rtx src = operands[1];
   10495        13497 :   rtx count_exp = operands[2];
   10496        13497 :   rtx expected_size_exp = operands[5];
   10497        13497 :   rtx min_size_exp = operands[6];
   10498        13497 :   rtx probable_max_size_exp = operands[8];
   10499        13497 :   unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
   10500        13497 :   HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
   10501        13497 :   unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
   10502        13497 :   unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
   10503              : 
   10504        13497 :   if (CONST_INT_P (count_exp))
   10505              :     {
   10506         2008 :       min_size = probable_max_size = count = expected_size
   10507         2008 :         = INTVAL (count_exp);
   10508              :       /* When COUNT is 0, there is nothing to do.  */
   10509         2008 :       if (!count)
   10510              :         return true;
   10511              :     }
   10512              :   else
   10513              :     {
   10514        11489 :       if (min_size_exp)
   10515        11489 :         min_size = INTVAL (min_size_exp);
   10516        11489 :       if (probable_max_size_exp)
   10517         8761 :         probable_max_size = INTVAL (probable_max_size_exp);
   10518        11489 :       if (CONST_INT_P (expected_size_exp))
   10519        11489 :         expected_size = INTVAL (expected_size_exp);
   10520              :      }
   10521              : 
   10522              :   /* Make sure we don't need to care about overflow later on.  */
   10523        13497 :   if (count > (HOST_WIDE_INT_1U << 30))
   10524              :     return false;
   10525              : 
   10526        13466 :   addr_space_t dst_as = MEM_ADDR_SPACE (dst);
   10527        13466 :   addr_space_t src_as = MEM_ADDR_SPACE (src);
   10528        13466 :   int dynamic_check;
   10529        13466 :   bool noalign;
   10530        13466 :   enum stringop_alg alg = decide_alg (count, expected_size, min_size,
   10531              :                                       probable_max_size, false, false,
   10532              :                                       dst_as, src_as, &dynamic_check,
   10533              :                                       &noalign, false);
   10534        13466 :   if (alg == libcall)
   10535              :     return false;
   10536              : 
   10537         5318 :   rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
   10538         5318 :   rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
   10539              : 
   10540         5318 :   unsigned int move_max = MOVE_MAX;
   10541         5318 :   machine_mode mode = smallest_int_mode_for_size
   10542         5318 :     (move_max * BITS_PER_UNIT).require ();
   10543         5318 :   if (probable_max_size && probable_max_size < move_max)
   10544              :     {
   10545              :       /* Get a usable MOVE_MAX.  */
   10546         2899 :       mode = smallest_int_mode_for_size
   10547         2899 :         (probable_max_size * BITS_PER_UNIT).require ();
   10548              :       /* Reduce MOVE_MAX by half so that MOVE_MAX can be used.  */
   10549         5798 :       if (GET_MODE_SIZE (mode) > probable_max_size)
   10550         2414 :         mode = smallest_int_mode_for_size
   10551         2414 :           (GET_MODE_BITSIZE (mode) / 2).require ();
   10552         5798 :       move_max = GET_MODE_SIZE (mode);
   10553              :     }
   10554              : 
   10555              :   /* Try to fully unroll memmove of known size first.  */
   10556         5318 :   if (count
   10557         5318 :       && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
   10558              :                                     mode))
   10559              :     return true;
   10560              : 
   10561         3454 :   rtx_code_label *done_label = gen_label_rtx ();
   10562              : 
   10563         3454 :   rtx_code_label *less_vec_label = nullptr;
   10564         3454 :   if (min_size == 0 || min_size < move_max)
   10565         2814 :     less_vec_label = gen_label_rtx ();
   10566              : 
   10567         3454 :   machine_mode count_mode = counter_mode (count_exp);
   10568              : 
   10569              :   /* Jump to LESS_VEC_LABEL if size < MOVE_MAX.  */
   10570         3454 :   if (less_vec_label)
   10571         2814 :     emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
   10572              :                              nullptr, count_mode, 1,
   10573              :                              less_vec_label);
   10574              : 
   10575         3454 :   rtx_code_label *more_2x_vec_label = nullptr;
   10576         3454 :   if (probable_max_size == 0 || probable_max_size > 2 * move_max)
   10577         1501 :     more_2x_vec_label = gen_label_rtx ();
   10578              : 
   10579              :   /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX.  */
   10580         1501 :   if (more_2x_vec_label)
   10581         1501 :     emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
   10582              :                              nullptr, count_mode, 1,
   10583              :                              more_2x_vec_label);
   10584              : 
   10585         3454 :   if (min_size == 0 || min_size <= 2 * move_max)
   10586              :     {
   10587              :       /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX.  */
   10588         3433 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10589              :                                              count_exp, mode, 2);
   10590         3433 :       emit_jump_insn (gen_jump (done_label));
   10591         3433 :       emit_barrier ();
   10592              :     }
   10593              : 
   10594         3454 :   if (less_vec_label)
   10595              :     {
   10596              :       /* Size < MOVE_MAX.  */
   10597         2814 :       emit_label (less_vec_label);
   10598         2814 :       ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
   10599              :                                     count_exp, min_size, mode,
   10600              :                                     done_label);
   10601         2814 :       emit_jump_insn (gen_jump (done_label));
   10602         2814 :       emit_barrier ();
   10603              :     }
   10604              : 
   10605         3454 :   if (more_2x_vec_label)
   10606              :     {
   10607              :       /* Size > 2 * MOVE_MAX and destination may overlap with source.  */
   10608         1501 :       emit_label (more_2x_vec_label);
   10609              : 
   10610         1501 :       rtx_code_label *more_8x_vec_label = nullptr;
   10611         1501 :       if (probable_max_size == 0 || probable_max_size > 8 * move_max)
   10612         1149 :         more_8x_vec_label = gen_label_rtx ();
   10613              : 
   10614              :       /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX.  */
   10615         1149 :       if (more_8x_vec_label)
   10616         1149 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
   10617              :                                  nullptr, count_mode, 1,
   10618              :                                  more_8x_vec_label);
   10619              : 
   10620         1501 :       rtx_code_label *last_4x_vec_label = nullptr;
   10621         1501 :       if (min_size == 0 || min_size <= 4 * move_max)
   10622         1490 :         last_4x_vec_label = gen_label_rtx ();
   10623              : 
   10624              :       /* Jump to LAST_4X_VEC_LABEL if size <= 4 * MOVE_MAX.  */
   10625         1490 :       if (last_4x_vec_label)
   10626         1490 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LEU,
   10627              :                                  nullptr, count_mode, 1,
   10628              :                                  last_4x_vec_label);
   10629              : 
   10630         1501 :       if (probable_max_size == 0 || probable_max_size > 4 * move_max)
   10631              :         {
   10632              :           /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX.  */
   10633         1216 :           ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
   10634              :                                                  srcreg, count_exp,
   10635              :                                                  mode, 8);
   10636         1216 :           emit_jump_insn (gen_jump (done_label));
   10637         1216 :           emit_barrier ();
   10638              :         }
   10639              : 
   10640         1501 :       if (last_4x_vec_label)
   10641              :         {
   10642              :           /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX.  */
   10643         1490 :           emit_label (last_4x_vec_label);
   10644         1490 :           ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
   10645              :                                                  srcreg, count_exp,
   10646              :                                                  mode, 4);
   10647         1490 :           emit_jump_insn (gen_jump (done_label));
   10648         1490 :           emit_barrier ();
   10649              :         }
   10650              : 
   10651         1501 :       if (more_8x_vec_label)
   10652              :         {
   10653              :           /* Size > 8 * MOVE_MAX.  */
   10654         1149 :           emit_label (more_8x_vec_label);
   10655              : 
   10656         1149 :           rtx loop_count = gen_reg_rtx (count_mode);
   10657         1149 :           emit_move_insn (loop_count, count_exp);
   10658              : 
   10659              :           /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
   10660              :              lower than destination address.  */
   10661         1149 :           rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
   10662         1149 :           emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
   10663         1149 :                                    GET_MODE (destreg), 1,
   10664              :                                    more_8x_vec_backward_label);
   10665              : 
   10666              :           /* Skip if source == destination which is less common.  */
   10667         1149 :           emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
   10668         1149 :                                    GET_MODE (destreg), 1, done_label,
   10669              :                                    profile_probability::unlikely ());
   10670              : 
   10671         1149 :           rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
   10672         1149 :           emit_move_insn (base_destreg, destreg);
   10673              : 
   10674              :           /* Load the last 4 * MOVE_MAX.  */
   10675         1149 :           rtx regs[4];
   10676         1149 :           ix86_expand_load_movmem (src, srcreg, count_exp, mode,
   10677              :                                    ARRAY_SIZE (regs), regs, true);
   10678              : 
   10679         1149 :           rtx srcmem = change_address (src, mode, srcreg);
   10680         1149 :           rtx destmem = change_address (dst, mode, destreg);
   10681              : 
   10682              :           /* Copy forward with a 4 * MOVE_MAX loop.  */
   10683         1149 :           rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
   10684         1149 :           emit_label (loop_4x_vec_forward_label);
   10685              : 
   10686         1149 :           ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);
   10687              : 
   10688         1149 :           rtx tmp;
   10689         1149 :           rtx delta = GEN_INT (4 * MOVE_MAX);
   10690              : 
   10691              :           /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
   10692         1149 :           tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
   10693              :                                      loop_count, delta, nullptr, 1,
   10694              :                                      OPTAB_DIRECT);
   10695         1149 :           if (tmp != loop_count)
   10696         1149 :             emit_move_insn (loop_count, tmp);
   10697              : 
   10698              :           /* Increment DESTREG and SRCREG by 4 * MOVE_MAX.  */
   10699         1149 :           tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
   10700              :                                      destreg, delta, nullptr, 1,
   10701              :                                      OPTAB_DIRECT);
   10702         1149 :           if (tmp != destreg)
   10703         1149 :             emit_move_insn (destreg, tmp);
   10704         1149 :           tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
   10705              :                                      delta, nullptr, 1, OPTAB_DIRECT);
   10706         1149 :           if (tmp != srcreg)
   10707         1149 :             emit_move_insn (srcreg, tmp);
   10708              : 
   10709              :           /* Stop if LOOP_COUNT <= 4 * MOVE_MAX.  */
   10710         1149 :           emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
   10711         1149 :                                    GET_MODE (loop_count), 1,
   10712              :                                    loop_4x_vec_forward_label);
   10713              : 
   10714              :           /* Store the last 4 * MOVE_MAX.  */
   10715         1149 :           ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
   10716              :                                     ARRAY_SIZE (regs), regs, true);
   10717              : 
   10718         1149 :           emit_jump_insn (gen_jump (done_label));
   10719         1149 :           emit_barrier ();
   10720              : 
   10721              :           /* Copy backward with a 4 * MOVE_MAX loop.  */
   10722         1149 :           emit_label (more_8x_vec_backward_label);
   10723              : 
   10724         1149 :           base_destreg = gen_reg_rtx (GET_MODE (destreg));
   10725         1149 :           emit_move_insn (base_destreg, destreg);
   10726              : 
   10727              :           /* Load the first 4 * MOVE_MAX.  */
   10728         1149 :           ix86_expand_load_movmem (src, srcreg, count_exp, mode,
   10729              :                                    ARRAY_SIZE (regs), regs, false);
   10730              : 
   10731              :           /* Increment DESTREG and SRCREG by COUNT_EXP.  */
   10732         1149 :           tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
   10733              :                                      destreg, count_exp, nullptr, 1,
   10734              :                                      OPTAB_DIRECT);
   10735         1149 :           if (tmp != destreg)
   10736         1149 :             emit_move_insn (destreg, tmp);
   10737         1149 :           tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
   10738              :                                      count_exp, nullptr, 1, OPTAB_DIRECT);
   10739         1149 :           if (tmp != srcreg)
   10740         1149 :             emit_move_insn (srcreg, tmp);
   10741              : 
   10742         1149 :           srcmem = change_address (src, mode, srcreg);
   10743         1149 :           destmem = change_address (dst, mode, destreg);
   10744         2298 :           rtx step = GEN_INT (-GET_MODE_SIZE (mode));
   10745         2298 :           srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10746         2298 :           destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10747              : 
   10748         1149 :           rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
   10749         1149 :           emit_label (loop_4x_vec_backward_label);
   10750              : 
   10751         1149 :           ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);
   10752              : 
   10753              :           /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
   10754         1149 :           tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
   10755              :                                      loop_count, delta, nullptr, 1,
   10756              :                                      OPTAB_DIRECT);
   10757         1149 :           if (tmp != loop_count)
   10758         1149 :             emit_move_insn (loop_count, tmp);
   10759              : 
   10760              :           /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX.  */
   10761         1149 :           tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
   10762              :                                      destreg, delta, nullptr, 1,
   10763              :                                      OPTAB_DIRECT);
   10764         1149 :           if (tmp != destreg)
   10765         1149 :             emit_move_insn (destreg, tmp);
   10766         1149 :           tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
   10767              :                                      delta, nullptr, 1, OPTAB_DIRECT);
   10768         1149 :           if (tmp != srcreg)
   10769         1149 :             emit_move_insn (srcreg, tmp);
   10770              : 
   10771              :           /* Stop if LOOP_COUNT <= 4 * MOVE_MAX.  */
   10772         1149 :           emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
   10773         1149 :                                    GET_MODE (loop_count), 1,
   10774              :                                    loop_4x_vec_backward_label);
   10775              : 
   10776              :           /* Store the first 4 * MOVE_MAX.  */
   10777         1149 :           ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
   10778              :                                     ARRAY_SIZE (regs), regs, false);
   10779              : 
   10780         1149 :           emit_jump_insn (gen_jump (done_label));
   10781         1149 :           emit_barrier ();
   10782              :         }
   10783              :     }
   10784              : 
   10785         3454 :   emit_label (done_label);
   10786              : 
   10787         3454 :   return true;
   10788              : }
   10789              : 
   10790              : /* Expand cmpstrn or memcmp.  RESULT receives the comparison value,
                      :    SRC1 and SRC2 are the MEMs being compared, LENGTH is the length
                      :    operand and ALIGN is handed unchanged to the cmpstrnqi patterns.
                      :    IS_CMPSTRN is true when expanding strncmp.  Return true if the
                      :    comparison was expanded here and false if it was declined; no
                      :    insns are emitted on the declining paths.  */
   10791              : 
   10792              : bool
   10793       170798 : ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
   10794              :                                rtx length, rtx align, bool is_cmpstrn)
   10795              : {
   10796              :   /* Expand strncmp and memcmp only with -minline-all-stringops since
   10797              :      "repz cmpsb" can be much slower than strncmp and memcmp functions
   10798              :      implemented with vector instructions, see
   10799              : 
   10800              :      https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   10801              :    */
   10802       170798 :   if (!TARGET_INLINE_ALL_STRINGOPS)
   10803              :     return false;
   10804              : 
   10805              :   /* Can't use this if the user has appropriated ecx, esi or edi.  */
   10806         5796 :   if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
   10807              :     return false;
   10808              : 
   10809         5796 :   if (is_cmpstrn)
   10810              :     {
   10811              :       /* For strncmp, length is the maximum length, which can be larger
   10812              :          than actual string lengths.  We can expand the cmpstrn pattern
   10813              :          to "repz cmpsb" only if one of the strings is a constant so
   10814              :          that expand_builtin_strncmp() can write the length argument to
   10815              :          be the minimum of the const string length and the actual length
   10816              :          argument.  Otherwise, "repz cmpsb" may read past the terminating
                      :          0 byte.  The test below accepts a MEM_EXPR that is a MEM_REF
                      :          of the address of a STRING_CST, for either operand.  */
   10817           69 :       tree t1 = MEM_EXPR (src1);
   10818           69 :       tree t2 = MEM_EXPR (src2);
   10819          138 :       if (!((t1 && TREE_CODE (t1) == MEM_REF
   10820           69 :              && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
   10821            0 :              && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
   10822              :                  == STRING_CST))
   10823           69 :             || (t2 && TREE_CODE (t2) == MEM_REF
   10824           69 :                 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
   10825           69 :                 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
   10826              :                     == STRING_CST))))
   10827              :         return false;
   10828              :     }
   10829              : 
   10830         5796 :   rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
   10831         5796 :   rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
   10832         5796 :   if (addr1 != XEXP (src1, 0))
   10833         5796 :     src1 = replace_equiv_address_nv (src1, addr1);
   10834         5796 :   if (addr2 != XEXP (src2, 0))
   10835         5796 :     src2 = replace_equiv_address_nv (src2, addr2);
   10836              : 
   10837              :   /* NB: Make a copy of the data length to avoid changing the original
   10838              :      data length by cmpstrnqi patterns.  */
   10839         5796 :   length = ix86_zero_extend_to_Pmode (length);
   10840         8711 :   rtx lengthreg = gen_reg_rtx (Pmode);
   10841         5796 :   emit_move_insn (lengthreg, length);
   10842              : 
   10843              :   /* If we are testing strict equality, we can use known alignment to
   10844              :      good advantage.  This may be possible with combine, particularly
   10845              :      once cc0 is dead.
                      : 
                      :      A constant zero LENGTH needs no comparison at all and a nonzero
                      :      constant uses the _nz pattern.  Otherwise LENGTHREG is first
                      :      compared with itself to set the flags before cmpstrnqi_1.
                      :      NOTE(review): LENGTH has already gone through
                      :      ix86_zero_extend_to_Pmode above, and this listing shows no hits
                      :      for the constant arm -- confirm it is still reachable.  */
   10846         5796 :   if (CONST_INT_P (length))
   10847              :     {
   10848            0 :       if (length == const0_rtx)
   10849              :         {
   10850            0 :           emit_move_insn (result, const0_rtx);
   10851            0 :           return true;
   10852              :         }
   10853            0 :       emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
   10854              :                                      src1, src2));
   10855              :     }
   10856              :   else
   10857              :     {
   10858         8711 :       emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
   10859         5796 :       emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
   10860              :                                   src1, src2));
   10861              :     }
   10862              : 
   10863         5796 :   rtx out = gen_lowpart (QImode, result);  /* QImode view of RESULT.  */
   10864         5796 :   emit_insn (gen_cmpintqi (out));
   10865         5796 :   emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
   10866              : 
   10867         5796 :   return true;
   10868              : }
   10869              : 
   10870              : /* Expand the appropriate insns for doing strlen if not just doing
   10871              :    repnz; scasb
   10872              : 
   10873              :    out = result, initialized with the start address
   10874              :    src = MEM for the string, re-addressed through OUT for each load
   10875              :    align_rtx = alignment of the address; anything but a CONST_INT is
   10876              :         treated as alignment 0
   10877              : 
   10878              :    This is just the body.  It needs the initialization mentioned above and
   10879              :    some address computing at the end.  The caller, ix86_expand_strlen,
                      :    does both.  The scratch registers are allocated here.  On exit OUT
                      :    holds the address of the terminating zero byte.  */
   10880              : 
   10881              : static void
   10882           11 : ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
   10883              : {
   10884           11 :   int align;
   10885           11 :   rtx tmp;
   10886           11 :   rtx_code_label *align_2_label = NULL;
   10887           11 :   rtx_code_label *align_3_label = NULL;
   10888           11 :   rtx_code_label *align_4_label = gen_label_rtx ();
   10889           11 :   rtx_code_label *end_0_label = gen_label_rtx ();
   10890           11 :   rtx mem;
   10891           11 :   rtx tmpreg = gen_reg_rtx (SImode);
   10892           11 :   rtx scratch = gen_reg_rtx (SImode);
   10893           11 :   rtx cmp;
   10894              : 
   10895           11 :   align = 0;
   10896           11 :   if (CONST_INT_P (align_rtx))
   10897           11 :     align = INTVAL (align_rtx);
   10898              : 
   10899              :   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
   10900              : 
   10901              :   /* Is there a known alignment and is it less than 4?  */
   10902           11 :   if (align < 4)
   10903              :     {
   10904           15 :       rtx scratch1 = gen_reg_rtx (Pmode);
   10905           11 :       emit_move_insn (scratch1, out);
   10906              :       /* Is there a known alignment and is it not 2? */
   10907           11 :       if (align != 2)
   10908              :         {
   10909           11 :           align_3_label = gen_label_rtx (); /* Taken when (addr & 3) == 3 */
   10910           11 :           align_2_label = gen_label_rtx (); /* Taken when (addr & 3) == 2 */
   10911              : 
   10912              :           /* Leave just the 2 lower bits (address & 3).  A value of 1
                      :              falls through all three jumps below, so three bytes are
                      :              then checked one at a time.  */
   10913           15 :           align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
   10914              :                                     NULL_RTX, 0, OPTAB_WIDEN);
   10915              : 
   10916           15 :           emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
   10917           11 :                                    Pmode, 1, align_4_label);
   10918           15 :           emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
   10919           11 :                                    Pmode, 1, align_2_label);
   10920           15 :           emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
   10921           11 :                                    Pmode, 1, align_3_label);
   10922              :         }
   10923              :       else
   10924              :         {
   10925              :           /* Since the alignment is 2, we have to check 2 or 0 bytes;
   10926              :              check if it is aligned to 4 bytes.  */
   10927              : 
   10928            0 :           align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
   10929              :                                     NULL_RTX, 0, OPTAB_WIDEN);
   10930              : 
   10931            0 :           emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
   10932            0 :                                    Pmode, 1, align_4_label);
   10933              :         }
   10934              : 
   10935           11 :       mem = change_address (src, QImode, out);
   10936              : 
   10937              :       /* Now compare the bytes.  */
   10938              : 
   10939              :       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
   10940           11 :       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
   10941              :                                QImode, 1, end_0_label);
   10942              : 
   10943              :       /* Increment the address.  */
   10944           11 :       emit_insn (gen_add2_insn (out, const1_rtx));
   10945              : 
   10946              :       /* Not needed with an alignment of 2.  */
   10947           11 :       if (align != 2)
   10948              :         {
   10949           11 :           emit_label (align_2_label);
   10950              : 
   10951           11 :           emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
   10952              :                                    end_0_label);
   10953              : 
   10954           11 :           emit_insn (gen_add2_insn (out, const1_rtx));
   10955              : 
   10956           11 :           emit_label (align_3_label);
   10957              :         }
   10958              : 
   10959           11 :       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
   10960              :                                end_0_label);
   10961              : 
   10962           11 :       emit_insn (gen_add2_insn (out, const1_rtx));
   10963              :     }
   10964              : 
   10965              :   /* Generate loop to check 4 bytes at a time.  It is not a good idea to
   10966              :      align this loop.  It gives only huge programs, but does not help to
   10967              :      speed up.  */
   10968           11 :   emit_label (align_4_label);
   10969              : 
   10970           11 :   mem = change_address (src, SImode, out);
   10971           11 :   emit_move_insn (scratch, mem);
   10972           11 :   emit_insn (gen_add2_insn (out, GEN_INT (4)));
   10973              : 
   10974              :   /* This formula yields a nonzero result iff one of the bytes is zero.
   10975              :      This saves three branches inside loop and many cycles.  It computes
                      :      tmpreg = (scratch - 0x01010101) & ~scratch & 0x80808080 and loops
                      :      back to align_4_label while the result is zero.  */
   10976              : 
   10977           11 :   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
   10978           11 :   emit_insn (gen_one_cmplsi2 (scratch, scratch));
   10979           11 :   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
   10980           11 :   emit_insn (gen_andsi3 (tmpreg, tmpreg,
   10981              :                          gen_int_mode (0x80808080, SImode)));
   10982           11 :   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
   10983              :                            align_4_label);
   10984              : 
   10985           11 :   if (TARGET_CMOVE)
   10986              :     {
   10987           11 :        rtx reg = gen_reg_rtx (SImode);
   10988           15 :        rtx reg2 = gen_reg_rtx (Pmode);
   10989           11 :        emit_move_insn (reg, tmpreg);
   10990           11 :        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
   10991              : 
   10992              :        /* If zero is not in the first two bytes, move two bytes forward.
                      :           Both conditional moves below are keyed on the flags set by
                      :           this one test of the low 16 bits of TMPREG.  */
   10993           11 :        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
   10994           11 :        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
   10995           11 :        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
   10996           11 :        emit_insn (gen_rtx_SET (tmpreg,
   10997              :                                gen_rtx_IF_THEN_ELSE (SImode, tmp,
   10998              :                                                      reg,
   10999              :                                                      tmpreg)));
   11000              :        /* Emit lea manually to avoid clobbering of flags.  */
   11001           15 :        emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
   11002              : 
   11003           11 :        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
   11004           11 :        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
   11005           15 :        emit_insn (gen_rtx_SET (out,
   11006              :                                gen_rtx_IF_THEN_ELSE (Pmode, tmp,
   11007              :                                                      reg2,
   11008              :                                                      out)));
   11009           11 :     }
   11010              :   else
   11011              :     {
   11012            0 :        rtx_code_label *end_2_label = gen_label_rtx ();
   11013              :        /* Is zero in the first two bytes?  If so, jump to end_2_label.  */
   11014              : 
   11015            0 :        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
   11016            0 :        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
   11017            0 :        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
   11018            0 :        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
   11019              :                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
   11020              :                             pc_rtx);
   11021            0 :        tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
   11022            0 :        JUMP_LABEL (tmp) = end_2_label;
   11023              : 
   11024              :        /* Not in the first two.  Move two bytes forward.  */
   11025            0 :        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
   11026            0 :        emit_insn (gen_add2_insn (out, const2_rtx));
   11027              : 
   11028            0 :        emit_label (end_2_label);
   11029              : 
   11030              :     }
   11031              : 
   11032              :   /* Avoid branch in fixing the byte.  NOTE(review): presumably adding
                      :      the low byte of TMPREG to itself sets the carry exactly when its
                      :      bit 7 is set, and the subtract-with-carry then steps OUT back by
                      :      3 or 4 accordingly -- confirm against the
                      :      addqi3_cconly_overflow and sub3_carry patterns.  */
   11033           11 :   tmpreg = gen_lowpart (QImode, tmpreg);
   11034           11 :   emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
   11035           11 :   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
   11036           11 :   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
   11037           15 :   emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
   11038              : 
   11039           11 :   emit_label (end_0_label);
   11040           11 : }
   11041              : 
   11042              : /* Expand strlen.  OUT receives the length, SRC is the MEM for the
                      :    string, EOSCHAR is the terminator searched for and ALIGN is the
                      :    alignment handed to ix86_expand_strlensi_unroll_1.  Only the
                      :    unrolled expansion is produced here; return false without
                      :    emitting anything unless TARGET_UNROLL_STRLEN and
                      :    TARGET_INLINE_ALL_STRINGOPS are set, EOSCHAR is const0_rtx and
                      :    optimize > 1.  */
   11043              : 
   11044              : bool
   11045        13880 : ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
   11046              : {
   11047        13880 : if (TARGET_UNROLL_STRLEN
   11048        13880 :            && TARGET_INLINE_ALL_STRINGOPS
   11049           11 :            && eoschar == const0_rtx
   11050           11 :            && optimize > 1)
   11051              :     {
   11052              :       /* The generic case of strlen expander is long.  Avoid expanding
   11053              :          it unless TARGET_INLINE_ALL_STRINGOPS.  */
   11054           15 :       rtx addr = force_reg (Pmode, XEXP (src, 0));
   11055              :       /* Well it seems that some optimizer does not combine a call like
   11056              :          foo(strlen(bar), strlen(bar));
   11057              :          when the move and the subtraction is done here.  It does calculate
   11058              :          the length just once when these instructions are done inside of
   11059              :          output_strlen_unroll().  But I think since &bar[strlen(bar)] is
   11060              :          often used and I use one fewer register for the lifetime of
   11061              :          output_strlen_unroll() this is better.  */
   11062              : 
   11063           11 :       emit_move_insn (out, addr);
   11064              : 
   11065           11 :       ix86_expand_strlensi_unroll_1 (out, src, align);
   11066              : 
   11067              :       /* strlensi_unroll_1 returns the address of the zero at the end of
   11068              :          the string, like memchr(), so compute the length by subtracting
   11069              :          the start address.  */
   11070           11 :       emit_insn (gen_sub2_insn (out, addr));
   11071           11 :       return true;
   11072              :     }
   11073              :   else
   11074              :     return false;
   11075              : }
   11076              : 
   11077              : /* For given symbol (function) construct code to compute address of its PLT
   11078              :    entry in large x86-64 PIC model.  SYMBOL must be a SYMBOL_REF.  Return a
                      :    new Pmode pseudo that is loaded with the UNSPEC_PLTOFF constant for
                      :    SYMBOL and then has pic_offset_table_rtx added to it.  */
   11079              : 
   11080              : static rtx
   11081           34 : construct_plt_address (rtx symbol)
   11082              : {
   11083           34 :   rtx tmp, unspec;
   11084              : 
   11085           34 :   gcc_assert (SYMBOL_REF_P (symbol));
   11086           34 :   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
   11087           34 :   gcc_assert (Pmode == DImode);
   11088              : 
   11089           34 :   tmp = gen_reg_rtx (Pmode);
   11090           34 :   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
   11091              : 
   11092           34 :   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
   11093           34 :   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
   11094           34 :   return tmp;
   11095              : }
   11096              : 
   11097              : /* Additional registers that are clobbered by SYSV calls: SI, DI and
                      :    XMM6 through XMM15, NUM_X86_64_MS_CLOBBERED_REGS entries in all.
                      :    NOTE(review): presumably these are the registers an MS ABI caller
                      :    expects a callee to preserve -- confirm against the ABI
                      :    documents.  */
   11098              : 
   11099              : static int const x86_64_ms_sysv_extra_clobbered_registers
   11100              :                  [NUM_X86_64_MS_CLOBBERED_REGS] =
   11101              : {
   11102              :   SI_REG, DI_REG,
   11103              :   XMM6_REG, XMM7_REG,
   11104              :   XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
   11105              :   XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
   11106              : };
   11107              : 
   11108              : rtx_insn *
   11109      6240966 : ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
   11110              :                   rtx callarg2,
   11111              :                   rtx pop, bool sibcall)
   11112              : {
   11113      6240966 :   rtx vec[3];
   11114      6240966 :   rtx use = NULL, call;
   11115      6240966 :   unsigned int vec_len = 0;
   11116      6240966 :   tree fndecl;
   11117      6240966 :   bool call_no_callee_saved_registers = false;
   11118              : 
   11119      6240966 :   if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
   11120              :     {
   11121      6057196 :       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
   11122      6057196 :       if (fndecl)
   11123              :         {
   11124      5797169 :           if (lookup_attribute ("interrupt",
   11125      5797169 :                                 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
   11126            1 :             error ("interrupt service routine cannot be called directly");
   11127      5797168 :           else if (ix86_type_no_callee_saved_registers_p (TREE_TYPE (fndecl)))
   11128      5797169 :             call_no_callee_saved_registers = true;
   11129      5797169 :           if (fndecl == current_function_decl
   11130      5797169 :               && decl_binds_to_current_def_p (fndecl))
   11131        11281 :             cfun->machine->recursive_function = true;
   11132              :         }
   11133              :     }
   11134              :   else
   11135              :     {
   11136       183770 :       if (MEM_P (fnaddr))
   11137              :         {
   11138       183770 :           tree mem_expr = MEM_EXPR (fnaddr);
   11139       183770 :           if (mem_expr != nullptr
   11140       183725 :               && TREE_CODE (mem_expr) == MEM_REF
   11141       367495 :               && ix86_type_no_callee_saved_registers_p (TREE_TYPE (mem_expr)))
   11142              :             call_no_callee_saved_registers = true;
   11143              :         }
   11144              : 
   11145              :       fndecl = NULL_TREE;
   11146              :     }
   11147              : 
   11148      6240966 :   if (pop == const0_rtx)
   11149            0 :     pop = NULL;
   11150      6240966 :   gcc_assert (!TARGET_64BIT || !pop);
   11151              : 
   11152      6240966 :   rtx addr = XEXP (fnaddr, 0);
   11153      6240966 :   if (TARGET_MACHO && !TARGET_64BIT)
   11154              :     {
   11155              : #if TARGET_MACHO
   11156              :       if (flag_pic && SYMBOL_REF_P (XEXP (fnaddr, 0)))
   11157              :         fnaddr = machopic_indirect_call_target (fnaddr);
   11158              : #endif
   11159              :     }
   11160              :   else
   11161              :     {
   11162              :       /* Static functions and indirect calls don't need the pic register.  Also,
   11163              :          check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
   11164              :          it an indirect call.  */
   11165      6240966 :       if (flag_pic
   11166       527914 :           && SYMBOL_REF_P (addr)
   11167      6741985 :           && ix86_call_use_plt_p (addr))
   11168              :         {
   11169       400738 :           if (flag_plt
   11170       400738 :               && (SYMBOL_REF_DECL (addr) == NULL_TREE
   11171       400704 :                   || !lookup_attribute ("noplt",
   11172       400704 :                                         DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
   11173              :             {
   11174       400703 :               if (!TARGET_64BIT
   11175       221973 :                   || (ix86_cmodel == CM_LARGE_PIC
   11176              :                       && DEFAULT_ABI != MS_ABI))
   11177              :                 {
   11178       536224 :                   use_reg (&use, gen_rtx_REG (Pmode,
   11179              :                                               REAL_PIC_OFFSET_TABLE_REGNUM));
   11180       178764 :                   if (ix86_use_pseudo_pic_reg ())
   11181       357494 :                     emit_move_insn (gen_rtx_REG (Pmode,
   11182       178764 :                                                  REAL_PIC_OFFSET_TABLE_REGNUM),
   11183              :                                     pic_offset_table_rtx);
   11184              :                 }
   11185              :             }
   11186           35 :           else if (!TARGET_PECOFF && !TARGET_MACHO)
   11187              :             {
   11188           35 :               if (TARGET_64BIT
   11189           35 :                   && ix86_cmodel == CM_LARGE_PIC
   11190              :                   && DEFAULT_ABI != MS_ABI)
   11191              :                 {
   11192            1 :                   fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
   11193              :                                            UNSPEC_GOT);
   11194            1 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11195            1 :                   fnaddr = force_reg (Pmode, fnaddr);
   11196            1 :                   fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
   11197              :                 }
   11198           34 :               else if (TARGET_64BIT)
   11199              :                 {
   11200           38 :                   fnaddr = gen_rtx_UNSPEC (Pmode,
   11201              :                                            gen_rtvec (1, addr),
   11202              :                                            UNSPEC_GOTPCREL);
   11203           38 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11204              :                 }
   11205              :               else
   11206              :                 {
   11207            0 :                   fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
   11208              :                                            UNSPEC_GOT);
   11209            0 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11210            0 :                   fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
   11211              :                                          fnaddr);
   11212              :                 }
   11213           39 :               fnaddr = gen_const_mem (Pmode, fnaddr);
   11214              :               /* Pmode may not be the same as word_mode for x32, which
   11215              :                  doesn't support indirect branch via 32-bit memory slot.
   11216              :                  Since x32 GOT slot is 64 bit with zero upper 32 bits,
   11217              :                  indirect branch via x32 GOT slot is OK.  */
   11218           35 :               if (GET_MODE (fnaddr) != word_mode)
   11219            4 :                 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
   11220           35 :               fnaddr = gen_rtx_MEM (QImode, fnaddr);
   11221              :             }
   11222              :         }
   11223              :     }
   11224              : 
   11225              :   /* Skip setting up RAX register for -mskip-rax-setup when there are no
   11226              :      parameters passed in vector registers.  */
   11227      6240966 :   if (TARGET_64BIT
   11228      5400984 :       && (INTVAL (callarg2) > 0
   11229      5339712 :           || (INTVAL (callarg2) == 0
   11230       321180 :               && (TARGET_SSE || !flag_skip_rax_setup))))
   11231              :     {
   11232       382450 :       rtx al = gen_rtx_REG (QImode, AX_REG);
   11233       382450 :       emit_move_insn (al, callarg2);
   11234       382450 :       use_reg (&use, al);
   11235              :     }
   11236              : 
   11237      6240966 :   if (ix86_cmodel == CM_LARGE_PIC
   11238              :       && !TARGET_PECOFF
   11239           45 :       && MEM_P (fnaddr)
   11240           45 :       && SYMBOL_REF_P (XEXP (fnaddr, 0))
   11241      6241003 :       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
   11242           34 :     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
   11243              :   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
   11244              :      branch via x32 GOT slot is OK.  */
   11245      6240932 :   else if (TARGET_X32
   11246           74 :       && MEM_P (fnaddr)
   11247           74 :       && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
   11248            8 :       && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)
   11249      6240936 :       && !TARGET_INDIRECT_BRANCH_REGISTER)
   11250              :     ;
   11251      6240932 :   else if (sibcall
   11252      6240932 :            ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
   11253      6112723 :            : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
   11254              :     {
   11255          532 :       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
   11256          532 :       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
   11257              :     }
   11258              : 
   11259              :   /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
   11260              :      mask off code pointers here.
   11261              :      TODO: also need to handle indirect jump.  */
   11262      6241988 :   if (ix86_memtag_can_tag_addresses () && !fndecl
   11263      6240990 :       && sanitize_flags_p (SANITIZE_HWADDRESS))
   11264              :     {
   11265           24 :       rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
   11266              :                                                         NULL_RTX);
   11267           24 :       fnaddr = gen_rtx_MEM (QImode, untagged_addr);
   11268              :     }
   11269              : 
   11270      6240966 :   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
   11271              : 
   11272      6240966 :   if (retval)
   11273      2462122 :     call = gen_rtx_SET (retval, call);
   11274      6240966 :   vec[vec_len++] = call;
   11275              : 
   11276      6240966 :   if (pop)
   11277              :     {
   11278       450736 :       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
   11279       225368 :       pop = gen_rtx_SET (stack_pointer_rtx, pop);
   11280       225368 :       vec[vec_len++] = pop;
   11281              :     }
   11282              : 
   11283      6240966 :   static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
   11284              : 
   11285      6240966 :   if ((cfun->machine->call_saved_registers
   11286      6240966 :        == TYPE_NO_CALLER_SAVED_REGISTERS)
   11287      6240966 :       && (!fndecl
   11288          468 :           || (!TREE_THIS_VOLATILE (fndecl)
   11289          186 :               && !lookup_attribute ("no_caller_saved_registers",
   11290          186 :                                     TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
   11291              :     {
   11292          182 :       bool is_64bit_ms_abi = (TARGET_64BIT
   11293          182 :                               && ix86_function_abi (fndecl) == MS_ABI);
   11294          182 :       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
   11295              : 
   11296              :       /* If there are no caller-saved registers, add all registers
   11297              :          that are clobbered by the call which returns.  */
   11298        16926 :       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
   11299        16744 :         if (!fixed_regs[i]
   11300         3242 :             && (ix86_call_used_regs[i] == 1
   11301         1506 :                 || (ix86_call_used_regs[i] & c_mask))
   11302         2150 :             && !STACK_REGNO_P (i)
   11303         2150 :             && !MMX_REGNO_P (i))
   11304         2150 :           clobber_reg (&use,
   11305         2150 :                        gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
   11306              :     }
   11307      5400802 :   else if (TARGET_64BIT_MS_ABI
   11308      6314197 :            && (!callarg2 || INTVAL (callarg2) != -2))
   11309              :     {
   11310              :       unsigned i;
   11311              : 
   11312       861848 :       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
   11313              :         {
   11314       795552 :           int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
   11315       795552 :           machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
   11316              : 
   11317       795552 :           clobber_reg (&use, gen_rtx_REG (mode, regno));
   11318              :         }
   11319              : 
   11320              :       /* Set here, but it may get cleared later.  */
   11321        66296 :       if (TARGET_CALL_MS2SYSV_XLOGUES)
   11322              :         {
   11323         7046 :           if (!TARGET_SSE)
   11324              :             ;
   11325              : 
   11326              :           /* Don't break hot-patched functions.  */
   11327         7046 :           else if (ix86_function_ms_hook_prologue (current_function_decl))
   11328              :             ;
   11329              : 
   11330              :           /* TODO: Cases not yet examined.  */
   11331         7046 :           else if (flag_split_stack)
   11332            0 :             warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
   11333              : 
   11334              :           else
   11335              :             {
   11336         7046 :               gcc_assert (!reload_completed);
   11337         7046 :               cfun->machine->call_ms2sysv = true;
   11338              :             }
   11339              :         }
   11340              :     }
   11341              : 
   11342      6240966 :   if (TARGET_MACHO && TARGET_64BIT && !sibcall
   11343              :       && ((SYMBOL_REF_P (addr) && !SYMBOL_REF_LOCAL_P (addr))
   11344              :           || !fndecl || TREE_PUBLIC (fndecl)))
   11345              :     {
   11346              :       /* We allow public functions defined in a TU to bind locally for PIC
   11347              :          code (the default) on 64bit Mach-O.
   11348              :          If such functions are not inlined, we cannot tell at compile-time if
   11349              :          they will be called via the lazy symbol resolver (this can depend on
   11350              :          options given at link-time).  Therefore, we must assume that the lazy
   11351              :          resolver could be used which clobbers R11 and R10.  */
   11352              :       clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
   11353              :       clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
   11354              :     }
   11355              : 
   11356      6240966 :   if (call_no_callee_saved_registers)
   11357              :     {
   11358              :       /* After calling a no_callee_saved_registers function, all
   11359              :          registers may be clobbered.  Clobber all registers that are
   11360              :          not used by the callee.  */
   11361           61 :       bool is_64bit_ms_abi = (TARGET_64BIT
   11362           61 :                               && ix86_function_abi (fndecl) == MS_ABI);
   11363           61 :       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
   11364         5673 :       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
   11365         5612 :         if (!fixed_regs[i]
   11366         2691 :             && i != HARD_FRAME_POINTER_REGNUM
   11367         2630 :             && !(ix86_call_used_regs[i] == 1
   11368         1007 :                  || (ix86_call_used_regs[i] & c_mask))
   11369          305 :             && !STACK_REGNO_P (i)
   11370          305 :             && !MMX_REGNO_P (i))
   11371          305 :           clobber_reg (&use,
   11372          305 :                        gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
   11373              :     }
   11374              : 
   11375      6240966 :   if (vec_len > 1)
   11376       225368 :     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
   11377      6240966 :   rtx_insn *call_insn = emit_call_insn (call);
   11378      6240966 :   if (use)
   11379       601652 :     CALL_INSN_FUNCTION_USAGE (call_insn) = use;
   11380              : 
   11381      6240966 :   return call_insn;
   11382              : }
   11383              : 
   11384              : /* Split simple return with popping POPC bytes from stack to indirect
   11385              :    branch with stack adjustment.
                      :
                      :    The emitted sequence pops the return address into ECX, adds POPC
                      :    to the stack pointer and then jumps through ECX.  Both stack
                      :    adjustments are marked frame related and carry REG_CFA_ADJUST_CFA
                      :    notes; cfun->machine->fs is updated for the popped word.  Only
                      :    valid for 32-bit targets (asserted below).  */
   11386              : 
   11387              : void
   11388            0 : ix86_split_simple_return_pop_internal (rtx popc)
   11389              : {
   11390            0 :   struct machine_function *m = cfun->machine;
   11391            0 :   rtx ecx = gen_rtx_REG (SImode, CX_REG);
   11392            0 :   rtx_insn *insn;
   11393              : 
   11394              :   /* There is no "pascal" calling convention in any 64bit ABI.  */
   11395            0 :   gcc_assert (!TARGET_64BIT);
   11396              : 
   11397            0 :   insn = emit_insn (gen_pop (ecx));  /* Return address -> ECX.  */
   11398            0 :   m->fs.cfa_offset -= UNITS_PER_WORD;
   11399            0 :   m->fs.sp_offset -= UNITS_PER_WORD;
   11400              : 
   11401            0 :   rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
   11402            0 :   x = gen_rtx_SET (stack_pointer_rtx, x);  /* sp = sp + UNITS_PER_WORD.  */
   11403            0 :   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
   11404            0 :   add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
   11405            0 :   RTX_FRAME_RELATED_P (insn) = 1;
   11406              : 
   11407            0 :   x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
   11408            0 :   x = gen_rtx_SET (stack_pointer_rtx, x);  /* sp = sp + POPC.  */
   11409            0 :   insn = emit_insn (x);
   11410            0 :   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
   11411            0 :   RTX_FRAME_RELATED_P (insn) = 1;
   11412              : 
   11413              :   /* Now return address is in ECX.  */
   11414            0 :   emit_jump_insn (gen_simple_return_indirect_internal (ecx));
   11415            0 : }
   11416              : 
   11417              : /* Errors in the source file can cause expand_expr to return const0_rtx
   11418              :    where we expect a vector.  To avoid crashing, use one of the vector
   11419              :    clear instructions.
                      :
                      :    If X is const0_rtx, return the zero constant of MODE
                      :    (CONST0_RTX (MODE)); otherwise return X unchanged.  */
   11420              : 
   11421              : static rtx
   11422       197970 : safe_vector_operand (rtx x, machine_mode mode)
   11423              : {
   11424            0 :   if (x == const0_rtx)
   11425            0 :     x = CONST0_RTX (mode);
   11426           24 :   return x;
   11427              : }
   11428              : 
   11429              : /* Subroutine of ix86_expand_builtin to take care of binop insns.
                      :
                      :    Expand the first two arguments of EXP as the inputs of ICODE and
                      :    emit the insn.  TARGET is reused only when not optimizing and it
                      :    already has the mode and satisfies the predicate of operand 0;
                      :    otherwise the result goes to a new pseudo.  An SImode second
                      :    operand for a TImode insn operand is first loaded into a V4SImode
                      :    register (gen_sse2_loadd) and then viewed as TImode.  Inputs that
                      :    fail their operand predicate are copied into registers.
                      :
                      :    Return the rtx holding the result, or 0 if the generator for ICODE
                      :    produced no pattern.  */
   11430              : 
   11431              : static rtx
   11432         8994 : ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
   11433              : {
   11434         8994 :   rtx pat;
   11435         8994 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11436         8994 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11437         8994 :   rtx op0 = expand_normal (arg0);
   11438         8994 :   rtx op1 = expand_normal (arg1);
   11439         8994 :   machine_mode tmode = insn_data[icode].operand[0].mode;
   11440         8994 :   machine_mode mode0 = insn_data[icode].operand[1].mode;
   11441         8994 :   machine_mode mode1 = insn_data[icode].operand[2].mode;
   11442              : 
   11443         8994 :   if (VECTOR_MODE_P (mode0))
   11444         8983 :     op0 = safe_vector_operand (op0, mode0);
   11445         8994 :   if (VECTOR_MODE_P (mode1))
   11446         8847 :     op1 = safe_vector_operand (op1, mode1);
   11447              : 
   11448         2848 :   if (optimize || !target
   11449         2848 :       || GET_MODE (target) != tmode
   11450        11842 :       || !insn_data[icode].operand[0].predicate (target, tmode))
   11451         6199 :     target = gen_reg_rtx (tmode);
   11452              : 
   11453         8994 :   if (GET_MODE (op1) == SImode && mode1 == TImode)
   11454              :     {
   11455            0 :       rtx x = gen_reg_rtx (V4SImode);
   11456            0 :       emit_insn (gen_sse2_loadd (x, op1));
   11457            0 :       op1 = gen_lowpart (TImode, x);
   11458              :     }
   11459              : 
   11460         8994 :   if (!insn_data[icode].operand[1].predicate (op0, mode0))
   11461         1405 :     op0 = copy_to_mode_reg (mode0, op0);
   11462         8994 :   if (!insn_data[icode].operand[2].predicate (op1, mode1))
   11463          817 :     op1 = copy_to_mode_reg (mode1, op1);
   11464              : 
   11465         8994 :   pat = GEN_FCN (icode) (target, op0, op1);
   11466         8994 :   if (! pat)
   11467              :     return 0;
   11468              : 
   11469         8994 :   emit_insn (pat);
   11470              : 
   11471         8994 :   return target;
   11472              : }
   11473              : 
   11474              : /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.
                      :    (The switch below also accepts M_TYPE values that take a single
                      :    argument.)
                      :
                      :    M_TYPE selects how many arguments are taken from EXP and whether
                      :    the last argument must be a constant, whether the insn takes a
                      :    comparison rtx built from SUB_CODE as an extra operand, or whether
                      :    SUB_CODE is passed as a trailing integer operand.  The arguments
                      :    are expanded into operands for ICODE.  TARGET is reused only when
                      :    not optimizing and it already has the mode and satisfies the
                      :    predicate of operand 0; otherwise a new pseudo is allocated.
                      :
                      :    Return the rtx holding the result, or 0 if the generator for ICODE
                      :    produced no pattern.  If the immediate of a vpermil2 insn is
                      :    rejected, an error is reported and a fresh pseudo is returned.  */
   11475              : 
   11476              : static rtx
   11477         1815 : ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
   11478              :                                enum ix86_builtin_func_type m_type,
   11479              :                                enum rtx_code sub_code)
   11480              : {
   11481         1815 :   rtx pat;
   11482         1815 :   unsigned int i, nargs;
   11483         1815 :   bool comparison_p = false;
   11484         1815 :   bool tf_p = false;
   11485         1815 :   bool last_arg_constant = false;
   11486         1815 :   int num_memory = 0;
   11487         1815 :   rtx xops[4];
   11488              : 
   11489         1815 :   machine_mode tmode = insn_data[icode].operand[0].mode;
   11490              : 
   11491         1815 :   switch (m_type)
   11492              :     {
   11493              :     case MULTI_ARG_4_DF2_DI_I:
   11494              :     case MULTI_ARG_4_DF2_DI_I1:
   11495              :     case MULTI_ARG_4_SF2_SI_I:
   11496              :     case MULTI_ARG_4_SF2_SI_I1:
   11497              :       nargs = 4;
   11498              :       last_arg_constant = true;
   11499              :       break;
   11500              : 
   11501          844 :     case MULTI_ARG_3_SF:
   11502          844 :     case MULTI_ARG_3_DF:
   11503          844 :     case MULTI_ARG_3_SF2:
   11504          844 :     case MULTI_ARG_3_DF2:
   11505          844 :     case MULTI_ARG_3_DI:
   11506          844 :     case MULTI_ARG_3_SI:
   11507          844 :     case MULTI_ARG_3_SI_DI:
   11508          844 :     case MULTI_ARG_3_HI:
   11509          844 :     case MULTI_ARG_3_HI_SI:
   11510          844 :     case MULTI_ARG_3_QI:
   11511          844 :     case MULTI_ARG_3_DI2:
   11512          844 :     case MULTI_ARG_3_SI2:
   11513          844 :     case MULTI_ARG_3_HI2:
   11514          844 :     case MULTI_ARG_3_QI2:
   11515          844 :       nargs = 3;
   11516          844 :       break;
   11517              : 
   11518          128 :     case MULTI_ARG_2_SF:
   11519          128 :     case MULTI_ARG_2_DF:
   11520          128 :     case MULTI_ARG_2_DI:
   11521          128 :     case MULTI_ARG_2_SI:
   11522          128 :     case MULTI_ARG_2_HI:
   11523          128 :     case MULTI_ARG_2_QI:
   11524          128 :       nargs = 2;
   11525          128 :       break;
   11526              : 
   11527           64 :     case MULTI_ARG_2_DI_IMM:
   11528           64 :     case MULTI_ARG_2_SI_IMM:
   11529           64 :     case MULTI_ARG_2_HI_IMM:
   11530           64 :     case MULTI_ARG_2_QI_IMM:
   11531           64 :       nargs = 2;
   11532           64 :       last_arg_constant = true;
   11533           64 :       break;
   11534              : 
   11535          187 :     case MULTI_ARG_1_SF:
   11536          187 :     case MULTI_ARG_1_DF:
   11537          187 :     case MULTI_ARG_1_SF2:
   11538          187 :     case MULTI_ARG_1_DF2:
   11539          187 :     case MULTI_ARG_1_DI:
   11540          187 :     case MULTI_ARG_1_SI:
   11541          187 :     case MULTI_ARG_1_HI:
   11542          187 :     case MULTI_ARG_1_QI:
   11543          187 :     case MULTI_ARG_1_SI_DI:
   11544          187 :     case MULTI_ARG_1_HI_DI:
   11545          187 :     case MULTI_ARG_1_HI_SI:
   11546          187 :     case MULTI_ARG_1_QI_DI:
   11547          187 :     case MULTI_ARG_1_QI_SI:
   11548          187 :     case MULTI_ARG_1_QI_HI:
   11549          187 :       nargs = 1;
   11550          187 :       break;
   11551              : 
   11552          384 :     case MULTI_ARG_2_DI_CMP:
   11553          384 :     case MULTI_ARG_2_SI_CMP:
   11554          384 :     case MULTI_ARG_2_HI_CMP:
   11555          384 :     case MULTI_ARG_2_QI_CMP:
   11556          384 :       nargs = 2;
   11557          384 :       comparison_p = true;
   11558          384 :       break;
   11559              : 
   11560          128 :     case MULTI_ARG_2_SF_TF:
   11561          128 :     case MULTI_ARG_2_DF_TF:
   11562          128 :     case MULTI_ARG_2_DI_TF:
   11563          128 :     case MULTI_ARG_2_SI_TF:
   11564          128 :     case MULTI_ARG_2_HI_TF:
   11565          128 :     case MULTI_ARG_2_QI_TF:
   11566          128 :       nargs = 2;
   11567          128 :       tf_p = true;
   11568          128 :       break;
   11569              : 
   11570            0 :     default:
   11571            0 :       gcc_unreachable ();
   11572              :     }
   11573              : 
   11574          628 :   if (optimize || !target
   11575          628 :       || GET_MODE (target) != tmode
   11576         2419 :       || !insn_data[icode].operand[0].predicate (target, tmode))
   11577         1211 :     target = gen_reg_rtx (tmode);
   11578          604 :   else if (memory_operand (target, tmode))
   11579            0 :     num_memory++;  /* TARGET itself counts as a memory operand.  */
   11580              : 
   11581         1815 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   11582              : 
   11583         6254 :   for (i = 0; i < nargs; i++)
   11584              :     {
   11585         4447 :       tree arg = CALL_EXPR_ARG (exp, i);
   11586         4447 :       rtx op = expand_normal (arg);
   11587         4447 :       int adjust = (comparison_p) ? 1 : 0;  /* Skip the comparison operand.  */
   11588         4447 :       machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
   11589              : 
   11590         4447 :       if (last_arg_constant && i == nargs - 1)
   11591              :         {
   11592          144 :           if (!insn_data[icode].operand[i + 1].predicate (op, mode))
   11593              :             {
   11594           30 :               enum insn_code new_icode = icode;
   11595           30 :               switch (icode)
   11596              :                 {
   11597            8 :                 case CODE_FOR_xop_vpermil2v2df3:
   11598            8 :                 case CODE_FOR_xop_vpermil2v4sf3:
   11599            8 :                 case CODE_FOR_xop_vpermil2v4df3:
   11600            8 :                 case CODE_FOR_xop_vpermil2v8sf3:
   11601            8 :                   error ("the last argument must be a 2-bit immediate");
   11602            8 :                   return gen_reg_rtx (tmode);
   11603            5 :                 case CODE_FOR_xop_rotlv2di3:
   11604            5 :                   new_icode = CODE_FOR_rotlv2di3;
   11605            5 :                   goto xop_rotl;
   11606            5 :                 case CODE_FOR_xop_rotlv4si3:
   11607            5 :                   new_icode = CODE_FOR_rotlv4si3;
   11608            5 :                   goto xop_rotl;
   11609            6 :                 case CODE_FOR_xop_rotlv8hi3:
   11610            6 :                   new_icode = CODE_FOR_rotlv8hi3;
   11611            6 :                   goto xop_rotl;
   11612              :                 case CODE_FOR_xop_rotlv16qi3:
   11613              :                   new_icode = CODE_FOR_rotlv16qi3;
   11614           22 :                 xop_rotl:
   11615           22 :                   if (CONST_INT_P (op))
   11616              :                     {
   11617            6 :                       int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
   11618            6 :                       op = GEN_INT (INTVAL (op) & mask);  /* Reduce the count.  */
   11619            6 :                       gcc_checking_assert
   11620              :                         (insn_data[icode].operand[i + 1].predicate (op, mode));
   11621              :                     }
   11622              :                   else
   11623              :                     {
   11624           16 :                       gcc_checking_assert
   11625              :                         (nargs == 2
   11626              :                          && insn_data[new_icode].operand[0].mode == tmode
   11627              :                          && insn_data[new_icode].operand[1].mode == tmode
   11628              :                          && insn_data[new_icode].operand[2].mode == mode
   11629              :                          && insn_data[new_icode].operand[0].predicate
   11630              :                             == insn_data[icode].operand[0].predicate
   11631              :                          && insn_data[new_icode].operand[1].predicate
   11632              :                             == insn_data[icode].operand[1].predicate);
   11633           16 :                       icode = new_icode;  /* Use the rotl pattern selected above.  */
   11634           16 :                       goto non_constant;
   11635              :                     }
   11636              :                   break;
   11637            0 :                 default:
   11638            0 :                   gcc_unreachable ();
   11639              :                 }
   11640              :             }
   11641              :         }
   11642              :       else
   11643              :         {
   11644         4303 :         non_constant:
   11645         4319 :           if (VECTOR_MODE_P (mode))
   11646         4303 :             op = safe_vector_operand (op, mode);
   11647              : 
   11648              :           /* If we aren't optimizing, only allow one memory operand to be
   11649              :              generated.  */
   11650         4319 :           if (memory_operand (op, mode))
   11651          826 :             num_memory++;
   11652              : 
   11653         4319 :           gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
   11654              : 
   11655         4319 :           if (optimize
   11656         1506 :               || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
   11657         5747 :               || num_memory > 1)
   11658         3398 :             op = force_reg (mode, op);
   11659              :         }
   11660              : 
   11661         4439 :       xops[i] = op;
   11662              :     }
   11663              : 
   11664         1807 :   switch (nargs)
   11665              :     {
   11666          187 :     case 1:
   11667          187 :       pat = GEN_FCN (icode) (target, xops[0]);
   11668          187 :       break;
   11669              : 
   11670          704 :     case 2:
   11671          704 :       if (tf_p)
   11672          128 :         pat = GEN_FCN (icode) (target, xops[0], xops[1],
   11673          128 :                                GEN_INT ((int)sub_code));
   11674          576 :       else if (! comparison_p)
   11675          192 :         pat = GEN_FCN (icode) (target, xops[0], xops[1]);
   11676              :       else
   11677              :         {
   11678          384 :           rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
   11679              :                                        xops[0], xops[1]);
   11680              : 
   11681          384 :           pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
   11682              :         }
   11683              :       break;
   11684              : 
   11685          844 :     case 3:
   11686          844 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
   11687          844 :       break;
   11688              : 
   11689           72 :     case 4:
   11690           72 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
   11691           72 :       break;
   11692              : 
   11693              :     default:
   11694              :       gcc_unreachable ();
   11695              :     }
   11696              : 
   11697         1807 :   if (! pat)
   11698              :     return 0;
   11699              : 
   11700         1807 :   emit_insn (pat);
   11701         1807 :   return target;
   11702              : }
   11703              : 
   11704              : /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   11705              :    insns with vec_merge.
                      :
                      :    The single argument of EXP is expanded once and supplied as both
                      :    input operands of ICODE (operand 2 starts as a copy of operand 1
                      :    and is moved to a register if it fails the operand 2 predicate).
                      :    TARGET is reused only when not optimizing and it already has the
                      :    mode and satisfies the predicate of operand 0.
                      :
                      :    Return the rtx holding the result, or 0 if the generator for ICODE
                      :    produced no pattern.  */
   11706              : 
   11707              : static rtx
   11708           52 : ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
   11709              :                                     rtx target)
   11710              : {
   11711           52 :   rtx pat;
   11712           52 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11713           52 :   rtx op1, op0 = expand_normal (arg0);
   11714           52 :   machine_mode tmode = insn_data[icode].operand[0].mode;
   11715           52 :   machine_mode mode0 = insn_data[icode].operand[1].mode;
   11716              : 
   11717           16 :   if (optimize || !target
   11718           16 :       || GET_MODE (target) != tmode
   11719           68 :       || !insn_data[icode].operand[0].predicate (target, tmode))
   11720           36 :     target = gen_reg_rtx (tmode);
   11721              : 
   11722           52 :   if (VECTOR_MODE_P (mode0))
   11723           52 :     op0 = safe_vector_operand (op0, mode0);
   11724              : 
   11725           36 :   if ((optimize && !register_operand (op0, mode0))
   11726           88 :       || !insn_data[icode].operand[1].predicate (op0, mode0))
   11727            0 :     op0 = copy_to_mode_reg (mode0, op0);
   11728              : 
   11729           52 :   op1 = op0;  /* Same value feeds operand 2.  */
   11730           52 :   if (!insn_data[icode].operand[2].predicate (op1, mode0))
   11731           16 :     op1 = copy_to_mode_reg (mode0, op1);
   11732              : 
   11733           52 :   pat = GEN_FCN (icode) (target, op0, op1);
   11734           52 :   if (! pat)
   11735              :     return 0;
   11736           52 :   emit_insn (pat);
   11737           52 :   return target;
   11738              : }
   11739              : 
   11740              : /* Subroutine of ix86_expand_builtin to take care of comparison insns.
                      :
                      :    D supplies the insn code and the comparison code.  The two
                      :    arguments of EXP become the inputs, exchanged first when SWAP is
                      :    true.  When optimizing, inputs that are not register operands are
                      :    copied into registers; otherwise only inputs failing their operand
                      :    predicate are.  A comparison rtx of the two inputs (in the mode of
                      :    operand 1) is passed to the generator as an additional operand.
                      :    TARGET is reused only when not optimizing and it already has the
                      :    mode and satisfies the predicate of operand 0.
                      :
                      :    Return the rtx holding the result, or 0 if the generator produced
                      :    no pattern.  */
   11741              : 
   11742              : static rtx
   11743          614 : ix86_expand_sse_compare (const struct builtin_description *d,
   11744              :                          tree exp, rtx target, bool swap)
   11745              : {
   11746          614 :   rtx pat;
   11747          614 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11748          614 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11749          614 :   rtx op0 = expand_normal (arg0);
   11750          614 :   rtx op1 = expand_normal (arg1);
   11751          614 :   rtx op2;
   11752          614 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11753          614 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11754          614 :   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
   11755          614 :   enum rtx_code comparison = d->comparison;
   11756              : 
   11757          614 :   if (VECTOR_MODE_P (mode0))
   11758          614 :     op0 = safe_vector_operand (op0, mode0);
   11759          614 :   if (VECTOR_MODE_P (mode1))
   11760          614 :     op1 = safe_vector_operand (op1, mode1);
   11761              : 
   11762              :   /* Swap operands if we have a comparison that isn't available in
   11763              :      hardware.  */
   11764          614 :   if (swap)
   11765           80 :     std::swap (op0, op1);
   11766              : 
   11767          202 :   if (optimize || !target
   11768          202 :       || GET_MODE (target) != tmode
   11769          816 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11770          412 :     target = gen_reg_rtx (tmode);
   11771              : 
   11772          412 :   if ((optimize && !register_operand (op0, mode0))
   11773          956 :       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
   11774          272 :     op0 = copy_to_mode_reg (mode0, op0);
   11775          412 :   if ((optimize && !register_operand (op1, mode1))
   11776          972 :       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
   11777           54 :     op1 = copy_to_mode_reg (mode1, op1);
   11778              : 
   11779          614 :   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
   11780          614 :   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
   11781          614 :   if (! pat)
   11782              :     return 0;
   11783          614 :   emit_insn (pat);
   11784          614 :   return target;
   11785              : }
   11786              : 
   11787              : /* Subroutine of the comi expanders (ix86_expand_sse_comi and its
   11788              :    rounding counterpart) to take care of ordered EQ or unordered NE,
                      :    generate PF jump.
                      :
                      :    If CHECK_UNORDERED, COMPARISON must be EQ or NE (asserted) and a
                      :    jump on UNORDERED of the flags register (in CCFPmode) is emitted to
                      :    a label placed after the setcc, so the store below is skipped for
                      :    unordered operands.  SET_DST is the flags rtx to test; if its mode
                      :    differs from MODE it is replaced by the flags register in MODE,
                      :    which must then be one of CCAmode, CCCmode, CCOmode, CCPmode,
                      :    CCSmode or CCZmode.  The QImode result of COMPARISON (SET_DST, 0)
                      :    is stored into the STRICT_LOW_PART of TARGET.
                      :
                      :    Return SUBREG_REG (TARGET); TARGET is therefore expected to be a
                      :    SUBREG.  */
   11789              : 
   11790              : static rtx
   11791          646 : ix86_ssecom_setcc (const enum rtx_code comparison,
   11792              :                    bool check_unordered, machine_mode mode,
   11793              :                    rtx set_dst, rtx target)
   11794              : {
   11795              : 
   11796          646 :   rtx_code_label *label = NULL;
   11797              : 
   11798              :   /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
   11799              :      with NAN operands.
   11800              :      Under TARGET_AVX10_2, VCOMX/VUCOMX are generated instead of
   11801              :      COMI/UCOMI.  VCOMX/VUCOMX will not set ZF for NAN operands.  */
   11802          646 :   if (check_unordered)
   11803              :     {
   11804          122 :       gcc_assert (comparison == EQ || comparison == NE);
   11805              : 
   11806          122 :       rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
   11807          122 :       label = gen_label_rtx ();
   11808          122 :       rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
   11809          122 :       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
   11810              :                                   gen_rtx_LABEL_REF (VOIDmode, label),
   11811              :                                   pc_rtx);
   11812          122 :       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
   11813              :     }
   11814              : 
   11815              :   /* NB: Set CCFPmode and check a different CCmode which is in subset
   11816              :      of CCFPmode.  */
   11817          646 :   if (GET_MODE (set_dst) != mode)
   11818              :     {
   11819          200 :       gcc_assert (mode == CCAmode || mode == CCCmode
   11820              :                   || mode == CCOmode || mode == CCPmode
   11821              :                   || mode == CCSmode || mode == CCZmode);
   11822          200 :       set_dst = gen_rtx_REG (mode, FLAGS_REG);
   11823              :     }
   11824              : 
   11825          646 :   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   11826              :                           gen_rtx_fmt_ee (comparison, QImode,
   11827              :                                           set_dst,
   11828              :                                           const0_rtx)));
   11829              : 
   11830          646 :   if (label)
   11831          122 :     emit_label (label);  /* Landing point of the unordered jump.  */
   11832              : 
   11833          646 :   return SUBREG_REG (target);
   11834              : }
   11835              : 
   11836              : /* Subroutine of ix86_expand_builtin to take care of comi insns.  D supplies the insn code and the comparison; EXP is the two-argument call.  */
   11837              : /* The incoming TARGET is not used: a fresh SImode pseudo is allocated, preset to 0 (1 for NE) and returned through ix86_ssecom_setcc.  COMX_OK permits the AVX10.2 comx patterns for EQ/NE.  */
   11838              : static rtx
   11839          547 : ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
   11840              :                       rtx target, bool comx_ok)
   11841              : {
   11842          547 :   rtx pat, set_dst;
   11843          547 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11844          547 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11845          547 :   rtx op0 = expand_normal (arg0);
   11846          547 :   rtx op1 = expand_normal (arg1);
   11847          547 :   enum insn_code icode = d->icode;
   11848          547 :   const struct insn_data_d *insn_p = &insn_data[icode];
   11849          547 :   machine_mode mode0 = insn_p->operand[0].mode;
   11850          547 :   machine_mode mode1 = insn_p->operand[1].mode;
   11851              : 
   11852          547 :   if (VECTOR_MODE_P (mode0))
   11853          547 :     op0 = safe_vector_operand (op0, mode0);
   11854          547 :   if (VECTOR_MODE_P (mode1))
   11855          547 :     op1 = safe_vector_operand (op1, mode1);
   11856              : 
   11857          547 :   enum rtx_code comparison = d->comparison;
   11858          547 :   rtx const_val = const0_rtx;
   11859              : /* LE/LT are handled by swapping the operands into GE/GT; EQ/NE test CCZmode and, unless the comx patterns are usable, request an unordered check.  */
   11860          547 :   bool check_unordered = false;
   11861          547 :   machine_mode mode = CCFPmode;
   11862          547 :   switch (comparison)
   11863              :     {
   11864          194 :     case LE:    /* -> GE  */
   11865          194 :     case LT:    /* -> GT  */
   11866          194 :       std::swap (op0, op1);
   11867          194 :       comparison = swap_condition (comparison);
   11868              :       /* FALLTHRU */
   11869              :     case GT:
   11870              :     case GE:
   11871              :       break;
   11872           73 :     case EQ:
   11873           73 :       if (!TARGET_AVX10_2 || !comx_ok)
   11874           45 :         check_unordered = true;
   11875              :       mode = CCZmode;
   11876              :       break;
   11877           96 :     case NE:
   11878           96 :       if (!TARGET_AVX10_2 || !comx_ok)
   11879           68 :         check_unordered = true;
   11880           96 :       mode = CCZmode;
   11881           96 :       const_val = const1_rtx;
   11882           96 :       break;
   11883            0 :     default:
   11884            0 :       gcc_unreachable ();
   11885              :     }
   11886              : /* Preset the SImode result (1 for NE, otherwise 0); only its low byte is written later, and the preset survives when the unordered jump is taken.  */
   11887          547 :   target = gen_reg_rtx (SImode);
   11888          547 :   emit_move_insn (target, const_val);
   11889          547 :   target = gen_rtx_SUBREG (QImode, target, 0);
   11890              : /* Operands are validated against the predicates of the original pattern: insn_p was taken before ICODE can be switched below.  */
   11891          426 :   if ((optimize && !register_operand (op0, mode0))
   11892          925 :       || !insn_p->operand[0].predicate (op0, mode0))
   11893          169 :     op0 = copy_to_mode_reg (mode0, op0);
   11894          426 :   if ((optimize && !register_operand (op1, mode1))
   11895          924 :       || !insn_p->operand[1].predicate (op1, mode1))
   11896           49 :     op1 = copy_to_mode_reg (mode1, op1);
   11897              : /* For EQ/NE with AVX10.2 and COMX_OK, switch to the matching comx/ucomx pattern; check_unordered was left false for exactly this case.  */
   11898          547 :   if ((comparison == EQ || comparison == NE)
   11899          169 :       && TARGET_AVX10_2 && comx_ok)
   11900              :     {
   11901           56 :       switch (icode)
   11902              :         {
   11903              :         case CODE_FOR_sse_comi:
   11904              :           icode = CODE_FOR_avx10_2_comxsf;
   11905              :           break;
   11906           14 :         case CODE_FOR_sse_ucomi:
   11907           14 :           icode = CODE_FOR_avx10_2_ucomxsf;
   11908           14 :           break;
   11909           14 :         case CODE_FOR_sse2_comi:
   11910           14 :           icode = CODE_FOR_avx10_2_comxdf;
   11911           14 :           break;
   11912           14 :         case CODE_FOR_sse2_ucomi:
   11913           14 :           icode = CODE_FOR_avx10_2_ucomxdf;
   11914           14 :           break;
   11915              : 
   11916            0 :         default:
   11917            0 :           gcc_unreachable ();
   11918              :         }
   11919              :     }
   11920          547 :   pat = GEN_FCN (icode) (op0, op1);
   11921          547 :   if (! pat)
   11922              :     return 0;
   11923              : /* Emit the compare, then let ix86_ssecom_setcc set the result's low byte from the compare's flags destination.  */
   11924          547 :   set_dst = SET_DEST (pat);
   11925          547 :   emit_insn (pat);
   11926          547 :   return ix86_ssecom_setcc (comparison, check_unordered, mode,
   11927          547 :                             set_dst, target);
   11928              : }
   11929              : 
   11930              : /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
   11931              : /* Expand a one-input round builtin: insn operand 0 is the result, operand 1 is the argument of EXP, and operand 2 is the constant D->comparison.  Returns the result rtx, or 0 if no pattern was generated.  */
   11932              : static rtx
   11933            0 : ix86_expand_sse_round (const struct builtin_description *d, tree exp,
   11934              :                        rtx target)
   11935              : {
   11936            0 :   rtx pat;
   11937            0 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11938            0 :   rtx op1, op0 = expand_normal (arg0);
   11939            0 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11940            0 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11941              : /* Use a fresh pseudo unless the caller's TARGET already has the right mode and satisfies the destination predicate (never reused when optimizing).  */
   11942            0 :   if (optimize || target == 0
   11943            0 :       || GET_MODE (target) != tmode
   11944            0 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11945            0 :     target = gen_reg_rtx (tmode);
   11946              : 
   11947            0 :   if (VECTOR_MODE_P (mode0))
   11948            0 :     op0 = safe_vector_operand (op0, mode0);
   11949              : /* OP0 is insn operand 1 (mode0 is read from operand 1), so it must be checked with operand 1's predicate, not the destination's.  */
   11950            0 :   if ((optimize && !register_operand (op0, mode0))
   11951            0 :       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
   11952            0 :     op0 = copy_to_mode_reg (mode0, op0);
   11953              : 
   11954            0 :   op1 = GEN_INT (d->comparison);
   11955              : 
   11956            0 :   pat = GEN_FCN (d->icode) (target, op0, op1);
   11957            0 :   if (! pat)
   11958              :     return 0;
   11959            0 :   emit_insn (pat);
   11960            0 :   return target;
   11961              : }
   11962              : /* Expand a two-input round builtin: insn operand 0 is the result, operands 1 and 2 are the two arguments of EXP, and operand 3 is the constant D->comparison.  Returns the result rtx, or 0 if no pattern was generated.  */
   11963              : static rtx
   11964           12 : ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
   11965              :                                      tree exp, rtx target)
   11966              : {
   11967           12 :   rtx pat;
   11968           12 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11969           12 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11970           12 :   rtx op0 = expand_normal (arg0);
   11971           12 :   rtx op1 = expand_normal (arg1);
   11972           12 :   rtx op2;
   11973           12 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11974           12 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11975           12 :   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
   11976              : /* Use a fresh pseudo unless the caller's TARGET already has the right mode and satisfies the destination predicate (never reused when optimizing).  */
   11977            0 :   if (optimize || target == 0
   11978            0 :       || GET_MODE (target) != tmode
   11979           12 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11980           12 :     target = gen_reg_rtx (tmode);
   11981              : 
   11982           12 :   op0 = safe_vector_operand (op0, mode0);
   11983           12 :   op1 = safe_vector_operand (op1, mode1);
   11984              : /* OP0 and OP1 are insn operands 1 and 2 (their modes are read from those operands), so each is checked with the predicate of its own operand.  */
   11985           12 :   if ((optimize && !register_operand (op0, mode0))
   11986           12 :       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
   11987           12 :     op0 = copy_to_mode_reg (mode0, op0);
   11988           12 :   if ((optimize && !register_operand (op1, mode1))
   11989           12 :       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
   11990           12 :     op1 = copy_to_mode_reg (mode1, op1);
   11991              : 
   11992           12 :   op2 = GEN_INT (d->comparison);
   11993              : 
   11994           12 :   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
   11995           12 :   if (! pat)
   11996              :     return 0;
   11997           12 :   emit_insn (pat);
   11998           12 :   return target;
   11999              : }
   12000              : 
   12001              : /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
   12002              : /* Reached from ix86_expand_args_builtin for the *_PTEST function types.  Folds results that are known from constant or identical operands; otherwise emits the ptest pattern and sets the low byte of a zeroed SImode pseudo from the flags using D->comparison.  */
   12003              : static rtx
   12004          239 : ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
   12005              :                        rtx target)
   12006              : {
   12007          239 :   rtx pat;
   12008          239 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   12009          239 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   12010          239 :   rtx op0 = expand_normal (arg0);
   12011          239 :   rtx op1 = expand_normal (arg1);
   12012          239 :   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
   12013          239 :   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   12014          239 :   enum rtx_code comparison = d->comparison;
   12015          239 :   rtx result = NULL_RTX;
   12016              : 
   12017          239 :   if (VECTOR_MODE_P (mode0))
   12018          239 :     op0 = safe_vector_operand (op0, mode0);
   12019          239 :   if (VECTOR_MODE_P (mode1))
   12020          239 :     op1 = safe_vector_operand (op1, mode1);
   12021              : /* Per builtin, set RESULT when the answer is already known, or rewrite the operands into an equivalent simpler test.  */
   12022          239 :   switch (d->code)
   12023              :     {
   12024           49 :     case IX86_BUILTIN_PTESTZ:
   12025           49 :     case IX86_BUILTIN_PTESTZ256:
   12026              :       // Returns (OP0 & OP1) == 0
   12027           49 :       if (rtx_equal_p (op0, CONST0_RTX (mode0))
   12028           49 :           || rtx_equal_p (op1, CONST0_RTX (mode1)))
   12029            2 :         result = const1_rtx;
   12030           47 :       else if (rtx_equal_p (op0, CONSTM1_RTX (mode0)))
   12031              :         {
   12032            1 :           op1 = force_reg (mode1, op1);
   12033            1 :           op0 = op1;
   12034              :         }
   12035           46 :       else if (rtx_equal_p (op1, CONSTM1_RTX (mode1)))
   12036              :         {
   12037            1 :           op0 = force_reg (mode0, op0);
   12038            1 :           op1 = op0;
   12039              :         }
   12040           45 :       else if (MEM_P (op0) && !MEM_P (op1))
   12041              :         std::swap (op0, op1);
   12042              :       break;
   12043              : 
   12044           31 :     case IX86_BUILTIN_PTESTC:
   12045           31 :     case IX86_BUILTIN_PTESTC256:
   12046              :       // Returns (~OP0 & OP1) == 0
   12047           31 :       if (rtx_equal_p (op0, CONSTM1_RTX (mode0))
   12048           31 :           || rtx_equal_p (op1, CONST0_RTX (mode1))
   12049           62 :           || rtx_equal_p (op0, op1))
   12050            2 :         result = const1_rtx;
   12051              :       break;
   12052              : 
   12053           27 :     case IX86_BUILTIN_PTESTNZC:
   12054           27 :     case IX86_BUILTIN_PTESTNZC256:
   12055              :       // Returns ((OP0 & OP1) != 0) && ((~OP0 & OP1) != 0)
   12056           27 :       if (rtx_equal_p (op0, CONST0_RTX (mode0))
   12057           26 :           || rtx_equal_p (op0, CONSTM1_RTX (mode0))
   12058           26 :           || rtx_equal_p (op1, CONST0_RTX (mode1))
   12059           53 :           || rtx_equal_p (op0, op1))
   12060            1 :         result = const0_rtx;
   12061              :       break;
   12062              : 
   12063              :     default:
   12064              :       break;
   12065              :     }
   12066              : /* Legitimize the operands.  NOTE(review): they are also copied into registers when RESULT is already known; the reason is not visible here.  */
   12067          167 :   if ((optimize && !register_operand (op0, mode0))
   12068          210 :       || !insn_data[d->icode].operand[0].predicate (op0, mode0)
   12069          377 :       || result)
   12070          104 :     op0 = copy_to_mode_reg (mode0, op0);
   12071          167 :   if ((optimize && !register_operand (op1, mode1))
   12072          211 :       || !insn_data[d->icode].operand[1].predicate (op1, mode1)
   12073          450 :       || result)
   12074           31 :     op1 = copy_to_mode_reg (mode1, op1);
   12075              : /* A folded result is returned directly, reusing TARGET when one was supplied.  */
   12076          239 :   if (result)
   12077              :     {
   12078            5 :       if (!target)
   12079            0 :         target = gen_reg_rtx (SImode);
   12080            5 :       emit_move_insn (target, result);
   12081            5 :       return target;
   12082              :     }
   12083              : /* Otherwise ignore the incoming TARGET: zero a fresh SImode pseudo and write only its low byte.  */
   12084          234 :   target = gen_reg_rtx (SImode);
   12085          234 :   emit_move_insn (target, const0_rtx);
   12086          234 :   target = gen_rtx_SUBREG (QImode, target, 0);
   12087              : 
   12088          234 :   pat = GEN_FCN (d->icode) (op0, op1);
   12089          234 :   if (! pat)
   12090              :     return 0;
   12091          234 :   emit_insn (pat);
   12092          234 :   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   12093              :                           gen_rtx_fmt_ee (comparison, QImode,
   12094              :                                           SET_DEST (pat),
   12095              :                                           const0_rtx)));
   12096              : 
   12097          234 :   return SUBREG_REG (target);
   12098              : }
   12099              : 
   12100              : /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
   12101              : /* Insn operands 0 and 1 are the two outputs, operands 2-5 take the first four arguments of EXP, and operand 6 takes the fifth, which must satisfy the immediate predicate.  Which value is returned depends on D->code and D->flag.  */
   12102              : static rtx
   12103          216 : ix86_expand_sse_pcmpestr (const struct builtin_description *d,
   12104              :                           tree exp, rtx target)
   12105              : {
   12106          216 :   rtx pat;
   12107          216 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   12108          216 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   12109          216 :   tree arg2 = CALL_EXPR_ARG (exp, 2);
   12110          216 :   tree arg3 = CALL_EXPR_ARG (exp, 3);
   12111          216 :   tree arg4 = CALL_EXPR_ARG (exp, 4);
   12112          216 :   rtx scratch0, scratch1;
   12113          216 :   rtx op0 = expand_normal (arg0);
   12114          216 :   rtx op1 = expand_normal (arg1);
   12115          216 :   rtx op2 = expand_normal (arg2);
   12116          216 :   rtx op3 = expand_normal (arg3);
   12117          216 :   rtx op4 = expand_normal (arg4);
   12118          216 :   machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
   12119              : /* Modes of the two outputs (tmode0, tmode1) and of the five inputs, read from the pattern.  */
   12120          216 :   tmode0 = insn_data[d->icode].operand[0].mode;
   12121          216 :   tmode1 = insn_data[d->icode].operand[1].mode;
   12122          216 :   modev2 = insn_data[d->icode].operand[2].mode;
   12123          216 :   modei3 = insn_data[d->icode].operand[3].mode;
   12124          216 :   modev4 = insn_data[d->icode].operand[4].mode;
   12125          216 :   modei5 = insn_data[d->icode].operand[5].mode;
   12126          216 :   modeimm = insn_data[d->icode].operand[6].mode;
   12127              : 
   12128          216 :   if (VECTOR_MODE_P (modev2))
   12129          216 :     op0 = safe_vector_operand (op0, modev2);
   12130          216 :   if (VECTOR_MODE_P (modev4))
   12131          216 :     op2 = safe_vector_operand (op2, modev4);
   12132              : /* Copy each input into a register of the required mode when it fails its operand predicate; OP2 is additionally forced into a register when optimizing.  */
   12133          216 :   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
   12134            6 :     op0 = copy_to_mode_reg (modev2, op0);
   12135          216 :   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
   12136           34 :     op1 = copy_to_mode_reg (modei3, op1);
   12137          160 :   if ((optimize && !register_operand (op2, modev4))
   12138          371 :       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
   12139            5 :     op2 = copy_to_mode_reg (modev4, op2);
   12140          216 :   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
   12141           34 :     op3 = copy_to_mode_reg (modei5, op3);
   12142              : /* The fifth argument is not copied to a register: if it fails the predicate, report an error and return const0_rtx.  */
   12143          216 :   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
   12144              :     {
   12145           21 :       error ("the fifth argument must be an 8-bit immediate");
   12146           21 :       return const0_rtx;
   12147              :     }
   12148              : /* PCMPESTRI128 returns insn operand 0 and PCMPESTRM128 returns insn operand 1, with the other output in a scratch; the remaining builtins put both outputs in scratches and require D->flag.  */
   12149          195 :   if (d->code == IX86_BUILTIN_PCMPESTRI128)
   12150              :     {
   12151            5 :       if (optimize || !target
   12152            5 :           || GET_MODE (target) != tmode0
   12153           34 :           || !insn_data[d->icode].operand[0].predicate (target, tmode0))
   12154           24 :         target = gen_reg_rtx (tmode0);
   12155              : 
   12156           29 :       scratch1 = gen_reg_rtx (tmode1);
   12157              : 
   12158           29 :       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
   12159              :     }
   12160          166 :   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
   12161              :     {
   12162            5 :       if (optimize || !target
   12163            5 :           || GET_MODE (target) != tmode1
   12164           36 :           || !insn_data[d->icode].operand[1].predicate (target, tmode1))
   12165           26 :         target = gen_reg_rtx (tmode1);
   12166              : 
   12167           31 :       scratch0 = gen_reg_rtx (tmode0);
   12168              : 
   12169           31 :       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
   12170              :     }
   12171              :   else
   12172              :     {
   12173          135 :       gcc_assert (d->flag);
   12174              : 
   12175          135 :       scratch0 = gen_reg_rtx (tmode0);
   12176          135 :       scratch1 = gen_reg_rtx (tmode1);
   12177              : 
   12178          135 :       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
   12179              :     }
   12180              : 
   12181          195 :   if (! pat)
   12182              :     return 0;
   12183              : 
   12184          195 :   emit_insn (pat);
   12185              : /* When D->flag is set it is used as the machine mode of the flags register: a zeroed SImode pseudo gets its low byte set from an EQ test of the flags in that mode, and that pseudo is returned.  */
   12186          195 :   if (d->flag)
   12187              :     {
   12188          135 :       target = gen_reg_rtx (SImode);
   12189          135 :       emit_move_insn (target, const0_rtx);
   12190          135 :       target = gen_rtx_SUBREG (QImode, target, 0);
   12191              : 
   12192          135 :       emit_insn
   12193          135 :         (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   12194              :                       gen_rtx_fmt_ee (EQ, QImode,
   12195              :                                       gen_rtx_REG ((machine_mode) d->flag,
   12196              :                                                    FLAGS_REG),
   12197              :                                       const0_rtx)));
   12198          135 :       return SUBREG_REG (target);
   12199              :     }
   12200              :   else
   12201              :     return target;
   12202              : }
   12203              : 
   12204              : 
   12205              : /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
   12206              : /* Insn operands 0 and 1 are the two outputs, operands 2 and 3 take the first two arguments of EXP, and operand 4 takes the third, which must satisfy the immediate predicate.  Which value is returned depends on D->code and D->flag.  */
   12207              : static rtx
   12208          275 : ix86_expand_sse_pcmpistr (const struct builtin_description *d,
   12209              :                           tree exp, rtx target)
   12210              : {
   12211          275 :   rtx pat;
   12212          275 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   12213          275 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   12214          275 :   tree arg2 = CALL_EXPR_ARG (exp, 2);
   12215          275 :   rtx scratch0, scratch1;
   12216          275 :   rtx op0 = expand_normal (arg0);
   12217          275 :   rtx op1 = expand_normal (arg1);
   12218          275 :   rtx op2 = expand_normal (arg2);
   12219          275 :   machine_mode tmode0, tmode1, modev2, modev3, modeimm;
   12220              : /* Modes of the two outputs (tmode0, tmode1) and of the three inputs, read from the pattern.  */
   12221          275 :   tmode0 = insn_data[d->icode].operand[0].mode;
   12222          275 :   tmode1 = insn_data[d->icode].operand[1].mode;
   12223          275 :   modev2 = insn_data[d->icode].operand[2].mode;
   12224          275 :   modev3 = insn_data[d->icode].operand[3].mode;
   12225          275 :   modeimm = insn_data[d->icode].operand[4].mode;
   12226              : 
   12227          275 :   if (VECTOR_MODE_P (modev2))
   12228          275 :     op0 = safe_vector_operand (op0, modev2);
   12229          275 :   if (VECTOR_MODE_P (modev3))
   12230          275 :     op1 = safe_vector_operand (op1, modev3);
   12231              : /* Copy each vector input into a register when it fails its operand predicate; OP1 is additionally forced into a register when optimizing.  */
   12232          275 :   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
   12233            4 :     op0 = copy_to_mode_reg (modev2, op0);
   12234          210 :   if ((optimize && !register_operand (op1, modev3))
   12235          481 :       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
   12236            4 :     op1 = copy_to_mode_reg (modev3, op1);
   12237              : /* The third argument is not copied to a register: if it fails the predicate, report an error and return const0_rtx.  */
   12238          275 :   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
   12239              :     {
   12240           21 :       error ("the third argument must be an 8-bit immediate");
   12241           21 :       return const0_rtx;
   12242              :     }
   12243              : /* PCMPISTRI128 returns insn operand 0 and PCMPISTRM128 returns insn operand 1, with the other output in a scratch; the remaining builtins put both outputs in scratches and require D->flag.  */
   12244          254 :   if (d->code == IX86_BUILTIN_PCMPISTRI128)
   12245              :     {
   12246            5 :       if (optimize || !target
   12247            5 :           || GET_MODE (target) != tmode0
   12248           38 :           || !insn_data[d->icode].operand[0].predicate (target, tmode0))
   12249           28 :         target = gen_reg_rtx (tmode0);
   12250              : 
   12251           33 :       scratch1 = gen_reg_rtx (tmode1);
   12252              : 
   12253           33 :       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
   12254              :     }
   12255          221 :   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
   12256              :     {
   12257            8 :       if (optimize || !target
   12258            8 :           || GET_MODE (target) != tmode1
   12259           58 :           || !insn_data[d->icode].operand[1].predicate (target, tmode1))
   12260           42 :         target = gen_reg_rtx (tmode1);
   12261              : 
   12262           50 :       scratch0 = gen_reg_rtx (tmode0);
   12263              : 
   12264           50 :       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
   12265              :     }
   12266              :   else
   12267              :     {
   12268          171 :       gcc_assert (d->flag);
   12269              : 
   12270          171 :       scratch0 = gen_reg_rtx (tmode0);
   12271          171 :       scratch1 = gen_reg_rtx (tmode1);
   12272              : 
   12273          171 :       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
   12274              :     }
   12275              : 
   12276          254 :   if (! pat)
   12277              :     return 0;
   12278              : 
   12279          254 :   emit_insn (pat);
   12280              : /* When D->flag is set it is used as the machine mode of the flags register: a zeroed SImode pseudo gets its low byte set from an EQ test of the flags in that mode, and that pseudo is returned.  */
   12281          254 :   if (d->flag)
   12282              :     {
   12283          171 :       target = gen_reg_rtx (SImode);
   12284          171 :       emit_move_insn (target, const0_rtx);
   12285          171 :       target = gen_rtx_SUBREG (QImode, target, 0);
   12286              : 
   12287          171 :       emit_insn
   12288          171 :         (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   12289              :                       gen_rtx_fmt_ee (EQ, QImode,
   12290              :                                       gen_rtx_REG ((machine_mode) d->flag,
   12291              :                                                    FLAGS_REG),
   12292              :                                       const0_rtx)));
   12293          171 :       return SUBREG_REG (target);
   12294              :     }
   12295              :   else
   12296              :     return target;
   12297              : }
   12298              : 
   12299              : /* Fixup modeless constants to fit required mode.  */
   12300              : /* If X has VOIDmode, return it converted to MODE via convert_to_mode (called with 1 as its last argument); otherwise return X unchanged.  */
   12301              : static rtx
   12302       260756 : fixup_modeless_constant (rtx x, machine_mode mode)
   12303              : {
   12304       260756 :   if (GET_MODE (x) == VOIDmode)
   12305        41433 :     x = convert_to_mode (mode, x, 1);
   12306       260756 :   return x;
   12307              : }
   12308              : 
   12309              : /* Expand the outgoing argument ARG to extract unsigned char and short
   12310              :    integer constants suitable for the predicates and the instruction
   12311              :    templates which expect the unsigned expanded value.  */
   12312              : /* Returns the rtx for ARG: a CONST_INT built from the constant's low bits in the special case below, otherwise whatever expand_normal produces.  */
   12313              : static rtx
   12314       281980 : ix86_expand_unsigned_small_int_cst_argument (tree arg)
   12315              : {
   12316              :   /* When passing 0xff as an unsigned char function argument with the
   12317              :      C frontend promotion, expand_normal gets
   12318              : 
   12319              :      <integer_cst 0x7fffe6aa23a8 type <integer_type 0x7fffe98225e8 int> constant 255>
   12320              : 
   12321              :      and returns the rtx value using the sign-extended representation:
   12322              : 
   12323              :      (const_int 255 [0xff])
   12324              : 
   12325              :      Without the C frontend promotion, expand_normal gets
   12326              : 
   12327              :      <integer_cst 0x7fffe9824018 type <integer_type 0x7fffe9822348 unsigned char > constant 255>
   12328              : 
   12329              :      and returns
   12330              : 
   12331              :      (const_int -1 [0xffffffffffffffff])
   12332              : 
   12333              :      which doesn't work with the predicates nor the instruction templates
   12334              :      which expect the unsigned expanded value.  Extract the unsigned char
   12335              :      and short integer constants to return
   12336              : 
   12337              :      (const_int 255 [0xff])
   12338              : 
   12339              :      so that the expanded value is always unsigned, without the C frontend
   12340              :      promotion.  */
   12341              : /* Special case: an INTEGER_CST whose type is integral, unsigned and narrower than int is turned into a CONST_INT directly from TREE_INT_CST_LOW, bypassing expand_normal.  */
   12342       281980 :   if (TREE_CODE (arg) == INTEGER_CST)
   12343              :     {
   12344        60302 :       tree type = TREE_TYPE (arg);
   12345        60302 :       if (INTEGRAL_TYPE_P (type)
   12346        60302 :           && TYPE_UNSIGNED (type)
   12347        82107 :           && TYPE_PRECISION (type) < TYPE_PRECISION (integer_type_node))
   12348              :         {
   12349        18319 :           HOST_WIDE_INT cst = TREE_INT_CST_LOW (arg);
   12350        18319 :           return GEN_INT (cst);
   12351              :         }
   12352              :     }
   12353              : /* Every other argument is expanded normally.  */
   12354       263661 :   return expand_normal (arg);
   12355              : }
   12356              : 
   12357              : /* Subroutine of ix86_expand_builtin to take care of insns with
   12358              :    variable number of operands.  */
   12359              : 
   12360              : static rtx
   12361        71001 : ix86_expand_args_builtin (const struct builtin_description *d,
   12362              :                           tree exp, rtx target)
   12363              : {
   12364        71001 :   rtx pat, real_target;
   12365        71001 :   unsigned int i, nargs;
   12366        71001 :   unsigned int nargs_constant = 0;
   12367        71001 :   unsigned int mask_pos = 0;
   12368        71001 :   int num_memory = 0;
   12369        71001 :   rtx xops[6];
   12370        71001 :   bool second_arg_count = false;
   12371        71001 :   enum insn_code icode = d->icode;
   12372        71001 :   const struct insn_data_d *insn_p = &insn_data[icode];
   12373        71001 :   machine_mode tmode = insn_p->operand[0].mode;
   12374        71001 :   machine_mode rmode = VOIDmode;
   12375        71001 :   bool swap = false;
   12376        71001 :   enum rtx_code comparison = d->comparison;
   12377              : 
   12378        71001 :   switch ((enum ix86_builtin_func_type) d->flag)
   12379              :     {
   12380            0 :     case V2DF_FTYPE_V2DF_ROUND:
   12381            0 :     case V4DF_FTYPE_V4DF_ROUND:
   12382            0 :     case V8DF_FTYPE_V8DF_ROUND:
   12383            0 :     case V4SF_FTYPE_V4SF_ROUND:
   12384            0 :     case V8SF_FTYPE_V8SF_ROUND:
   12385            0 :     case V16SF_FTYPE_V16SF_ROUND:
   12386            0 :     case V8HF_FTYPE_V8HF_ROUND:
   12387            0 :     case V16HF_FTYPE_V16HF_ROUND:
   12388            0 :     case V32HF_FTYPE_V32HF_ROUND:
   12389            0 :     case V4SI_FTYPE_V4SF_ROUND:
   12390            0 :     case V8SI_FTYPE_V8SF_ROUND:
   12391            0 :     case V16SI_FTYPE_V16SF_ROUND:
   12392            0 :       return ix86_expand_sse_round (d, exp, target);
   12393           12 :     case V4SI_FTYPE_V2DF_V2DF_ROUND:
   12394           12 :     case V8SI_FTYPE_V4DF_V4DF_ROUND:
   12395           12 :     case V16SI_FTYPE_V8DF_V8DF_ROUND:
   12396           12 :       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
   12397          239 :     case INT_FTYPE_V8SF_V8SF_PTEST:
   12398          239 :     case INT_FTYPE_V4DI_V4DI_PTEST:
   12399          239 :     case INT_FTYPE_V4DF_V4DF_PTEST:
   12400          239 :     case INT_FTYPE_V4SF_V4SF_PTEST:
   12401          239 :     case INT_FTYPE_V2DI_V2DI_PTEST:
   12402          239 :     case INT_FTYPE_V2DF_V2DF_PTEST:
   12403          239 :       return ix86_expand_sse_ptest (d, exp, target);
   12404              :     case FLOAT128_FTYPE_FLOAT128:
   12405              :     case FLOAT_FTYPE_FLOAT:
   12406              :     case FLOAT_FTYPE_BFLOAT16:
   12407              :     case INT_FTYPE_INT:
   12408              :     case UINT_FTYPE_UINT:
   12409              :     case UINT16_FTYPE_UINT16:
   12410              :     case UINT64_FTYPE_INT:
   12411              :     case UINT64_FTYPE_UINT64:
   12412              :     case INT64_FTYPE_INT64:
   12413              :     case INT64_FTYPE_V4SF:
   12414              :     case INT64_FTYPE_V2DF:
   12415              :     case INT_FTYPE_V16QI:
   12416              :     case INT_FTYPE_V8QI:
   12417              :     case INT_FTYPE_V8SF:
   12418              :     case INT_FTYPE_V4DF:
   12419              :     case INT_FTYPE_V4SF:
   12420              :     case INT_FTYPE_V2DF:
   12421              :     case INT_FTYPE_V32QI:
   12422              :     case V16QI_FTYPE_V16QI:
   12423              :     case V8SI_FTYPE_V8SF:
   12424              :     case V8SI_FTYPE_V4SI:
   12425              :     case V8HI_FTYPE_V8HI:
   12426              :     case V8HI_FTYPE_V16QI:
   12427              :     case V8QI_FTYPE_V8QI:
   12428              :     case V8SF_FTYPE_V8SF:
   12429              :     case V8SF_FTYPE_V8SI:
   12430              :     case V8SF_FTYPE_V4SF:
   12431              :     case V8SF_FTYPE_V8HI:
   12432              :     case V4SI_FTYPE_V4SI:
   12433              :     case V4SI_FTYPE_V16QI:
   12434              :     case V4SI_FTYPE_V4SF:
   12435              :     case V4SI_FTYPE_V8SI:
   12436              :     case V4SI_FTYPE_V8HI:
   12437              :     case V4SI_FTYPE_V4DF:
   12438              :     case V4SI_FTYPE_V2DF:
   12439              :     case V4HI_FTYPE_V4HI:
   12440              :     case V4DF_FTYPE_V4DF:
   12441              :     case V4DF_FTYPE_V4SI:
   12442              :     case V4DF_FTYPE_V4SF:
   12443              :     case V4DF_FTYPE_V2DF:
   12444              :     case V4SF_FTYPE_V4SF:
   12445              :     case V4SF_FTYPE_V4SI:
   12446              :     case V4SF_FTYPE_V8SF:
   12447              :     case V4SF_FTYPE_V4DF:
   12448              :     case V4SF_FTYPE_V8HI:
   12449              :     case V4SF_FTYPE_V2DF:
   12450              :     case V2DI_FTYPE_V2DI:
   12451              :     case V2DI_FTYPE_V16QI:
   12452              :     case V2DI_FTYPE_V8HI:
   12453              :     case V2DI_FTYPE_V4SI:
   12454              :     case V2DF_FTYPE_V2DF:
   12455              :     case V2DF_FTYPE_V4SI:
   12456              :     case V2DF_FTYPE_V4DF:
   12457              :     case V2DF_FTYPE_V4SF:
   12458              :     case V2DF_FTYPE_V2SI:
   12459              :     case V2SI_FTYPE_V2SI:
   12460              :     case V2SI_FTYPE_V4SF:
   12461              :     case V2SI_FTYPE_V2SF:
   12462              :     case V2SI_FTYPE_V2DF:
   12463              :     case V2SF_FTYPE_V2SF:
   12464              :     case V2SF_FTYPE_V2SI:
   12465              :     case V32QI_FTYPE_V32QI:
   12466              :     case V32QI_FTYPE_V16QI:
   12467              :     case V16HI_FTYPE_V16HI:
   12468              :     case V16HI_FTYPE_V8HI:
   12469              :     case V8SI_FTYPE_V8SI:
   12470              :     case V16HI_FTYPE_V16QI:
   12471              :     case V8SI_FTYPE_V16QI:
   12472              :     case V4DI_FTYPE_V16QI:
   12473              :     case V8SI_FTYPE_V8HI:
   12474              :     case V4DI_FTYPE_V8HI:
   12475              :     case V4DI_FTYPE_V4SI:
   12476              :     case V4DI_FTYPE_V2DI:
   12477              :     case UQI_FTYPE_UQI:
   12478              :     case UHI_FTYPE_UHI:
   12479              :     case USI_FTYPE_USI:
   12480              :     case USI_FTYPE_UQI:
   12481              :     case USI_FTYPE_UHI:
   12482              :     case UDI_FTYPE_UDI:
   12483              :     case UHI_FTYPE_V16QI:
   12484              :     case USI_FTYPE_V32QI:
   12485              :     case UDI_FTYPE_V64QI:
   12486              :     case V16QI_FTYPE_UHI:
   12487              :     case V32QI_FTYPE_USI:
   12488              :     case V64QI_FTYPE_UDI:
   12489              :     case V8HI_FTYPE_UQI:
   12490              :     case V16HI_FTYPE_UHI:
   12491              :     case V32HI_FTYPE_USI:
   12492              :     case V4SI_FTYPE_UQI:
   12493              :     case V8SI_FTYPE_UQI:
   12494              :     case V4SI_FTYPE_UHI:
   12495              :     case V8SI_FTYPE_UHI:
   12496              :     case UQI_FTYPE_V8HI:
   12497              :     case UHI_FTYPE_V16HI:
   12498              :     case USI_FTYPE_V32HI:
   12499              :     case UQI_FTYPE_V4SI:
   12500              :     case UQI_FTYPE_V8SI:
   12501              :     case UHI_FTYPE_V16SI:
   12502              :     case UQI_FTYPE_V2DI:
   12503              :     case UQI_FTYPE_V4DI:
   12504              :     case UQI_FTYPE_V8DI:
   12505              :     case V16SI_FTYPE_UHI:
   12506              :     case V2DI_FTYPE_UQI:
   12507              :     case V4DI_FTYPE_UQI:
   12508              :     case V16SI_FTYPE_INT:
   12509              :     case V16SF_FTYPE_V8SF:
   12510              :     case V16SI_FTYPE_V8SI:
   12511              :     case V16SF_FTYPE_V4SF:
   12512              :     case V16SI_FTYPE_V4SI:
   12513              :     case V16SI_FTYPE_V16SF:
   12514              :     case V16SI_FTYPE_V16SI:
   12515              :     case V64QI_FTYPE_V64QI:
   12516              :     case V32HI_FTYPE_V32HI:
   12517              :     case V16SF_FTYPE_V16SF:
   12518              :     case V8DI_FTYPE_UQI:
   12519              :     case V8DI_FTYPE_V8DI:
   12520              :     case V8DF_FTYPE_V4DF:
   12521              :     case V8DF_FTYPE_V2DF:
   12522              :     case V8DF_FTYPE_V8DF:
   12523              :     case V4DI_FTYPE_V4DI:
   12524              :     case V16BF_FTYPE_V16SF:
   12525              :     case V8BF_FTYPE_V8SF:
   12526              :     case V8BF_FTYPE_V4SF:
   12527              :       nargs = 1;
   12528              :       break;
   12529           52 :     case V4SF_FTYPE_V4SF_VEC_MERGE:
   12530           52 :     case V2DF_FTYPE_V2DF_VEC_MERGE:
   12531           52 :       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
   12532         9528 :     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
   12533         9528 :     case V16QI_FTYPE_V16QI_V16QI:
   12534         9528 :     case V16QI_FTYPE_V8HI_V8HI:
   12535         9528 :     case V16HF_FTYPE_V16HF_V16HF:
   12536         9528 :     case V16SF_FTYPE_V16SF_V16SF:
   12537         9528 :     case V16SI_FTYPE_V16SI_V16SI:
   12538         9528 :     case V8QI_FTYPE_V8QI_V8QI:
   12539         9528 :     case V8QI_FTYPE_V4HI_V4HI:
   12540         9528 :     case V8HI_FTYPE_V8HI_V8HI:
   12541         9528 :     case V8HI_FTYPE_V16QI_V16QI:
   12542         9528 :     case V8HI_FTYPE_V4SI_V4SI:
   12543         9528 :     case V8HF_FTYPE_V8HF_V8HF:
   12544         9528 :     case V8SF_FTYPE_V8SF_V8SF:
   12545         9528 :     case V8SF_FTYPE_V8SF_V8SI:
   12546         9528 :     case V8DF_FTYPE_V8DF_V8DF:
   12547         9528 :     case V4SI_FTYPE_V4SI_V4SI:
   12548         9528 :     case V4SI_FTYPE_V8HI_V8HI:
   12549         9528 :     case V4SI_FTYPE_V2DF_V2DF:
   12550         9528 :     case V4HI_FTYPE_V4HI_V4HI:
   12551         9528 :     case V4HI_FTYPE_V8QI_V8QI:
   12552         9528 :     case V4HI_FTYPE_V2SI_V2SI:
   12553         9528 :     case V4DF_FTYPE_V4DF_V4DF:
   12554         9528 :     case V4DF_FTYPE_V4DF_V4DI:
   12555         9528 :     case V4SF_FTYPE_V4SF_V4SF:
   12556         9528 :     case V4SF_FTYPE_V4SF_V4SI:
   12557         9528 :     case V4SF_FTYPE_V4SF_V2SI:
   12558         9528 :     case V4SF_FTYPE_V4SF_V2DF:
   12559         9528 :     case V4SF_FTYPE_V4SF_UINT:
   12560         9528 :     case V4SF_FTYPE_V4SF_DI:
   12561         9528 :     case V4SF_FTYPE_V4SF_SI:
   12562         9528 :     case V4DI_FTYPE_V4DI_V2DI:
   12563         9528 :     case V2DI_FTYPE_V2DI_V2DI:
   12564         9528 :     case V2DI_FTYPE_V16QI_V16QI:
   12565         9528 :     case V2DI_FTYPE_V4SI_V4SI:
   12566         9528 :     case V2DI_FTYPE_V2DI_V16QI:
   12567         9528 :     case V2SI_FTYPE_V2SI_V2SI:
   12568         9528 :     case V2SI_FTYPE_V4HI_V4HI:
   12569         9528 :     case V2SI_FTYPE_V2SF_V2SF:
   12570         9528 :     case V2DF_FTYPE_V2DF_V2DF:
   12571         9528 :     case V2DF_FTYPE_V2DF_V4SF:
   12572         9528 :     case V2DF_FTYPE_V2DF_V2DI:
   12573         9528 :     case V2DF_FTYPE_V2DF_DI:
   12574         9528 :     case V2DF_FTYPE_V2DF_SI:
   12575         9528 :     case V2DF_FTYPE_V2DF_UINT:
   12576         9528 :     case V2SF_FTYPE_V2SF_V2SF:
   12577         9528 :     case V1DI_FTYPE_V1DI_V1DI:
   12578         9528 :     case V1DI_FTYPE_V8QI_V8QI:
   12579         9528 :     case V1DI_FTYPE_V2SI_V2SI:
   12580         9528 :     case V32QI_FTYPE_V16HI_V16HI:
   12581         9528 :     case V16HI_FTYPE_V8SI_V8SI:
   12582         9528 :     case V64QI_FTYPE_V64QI_V64QI:
   12583         9528 :     case V32QI_FTYPE_V32QI_V32QI:
   12584         9528 :     case V32BF_FTYPE_V32BF_V32BF:
   12585         9528 :     case V16BF_FTYPE_V16BF_V16BF:
   12586         9528 :     case V8BF_FTYPE_V8BF_V8BF:
   12587         9528 :     case V16HI_FTYPE_V32QI_V32QI:
   12588         9528 :     case V16HI_FTYPE_V16HI_V16HI:
   12589         9528 :     case V8SI_FTYPE_V4DF_V4DF:
   12590         9528 :     case V8SI_FTYPE_V8SI_V8SI:
   12591         9528 :     case V8SI_FTYPE_V16HI_V16HI:
   12592         9528 :     case V4DI_FTYPE_V4DI_V4DI:
   12593         9528 :     case V4DI_FTYPE_V8SI_V8SI:
   12594         9528 :     case V4DI_FTYPE_V32QI_V32QI:
   12595         9528 :     case V8DI_FTYPE_V64QI_V64QI:
   12596         9528 :       if (comparison == UNKNOWN)
   12597         8994 :         return ix86_expand_binop_builtin (icode, exp, target);
   12598              :       nargs = 2;
   12599              :       break;
   12600           80 :     case V4SF_FTYPE_V4SF_V4SF_SWAP:
   12601           80 :     case V2DF_FTYPE_V2DF_V2DF_SWAP:
   12602           80 :       gcc_assert (comparison != UNKNOWN);
   12603              :       nargs = 2;
   12604              :       swap = true;
   12605              :       break;
   12606         1481 :     case V16HI_FTYPE_V16HI_V8HI_COUNT:
   12607         1481 :     case V16HI_FTYPE_V16HI_SI_COUNT:
   12608         1481 :     case V8SI_FTYPE_V8SI_V4SI_COUNT:
   12609         1481 :     case V8SI_FTYPE_V8SI_SI_COUNT:
   12610         1481 :     case V4DI_FTYPE_V4DI_V2DI_COUNT:
   12611         1481 :     case V4DI_FTYPE_V4DI_INT_COUNT:
   12612         1481 :     case V8HI_FTYPE_V8HI_V8HI_COUNT:
   12613         1481 :     case V8HI_FTYPE_V8HI_SI_COUNT:
   12614         1481 :     case V4SI_FTYPE_V4SI_V4SI_COUNT:
   12615         1481 :     case V4SI_FTYPE_V4SI_SI_COUNT:
   12616         1481 :     case V4HI_FTYPE_V4HI_V4HI_COUNT:
   12617         1481 :     case V4HI_FTYPE_V4HI_SI_COUNT:
   12618         1481 :     case V2DI_FTYPE_V2DI_V2DI_COUNT:
   12619         1481 :     case V2DI_FTYPE_V2DI_SI_COUNT:
   12620         1481 :     case V2SI_FTYPE_V2SI_V2SI_COUNT:
   12621         1481 :     case V2SI_FTYPE_V2SI_SI_COUNT:
   12622         1481 :     case V1DI_FTYPE_V1DI_V1DI_COUNT:
   12623         1481 :     case V1DI_FTYPE_V1DI_SI_COUNT:
   12624         1481 :       nargs = 2;
   12625         1481 :       second_arg_count = true;
   12626         1481 :       break;
   12627         1408 :     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
   12628         1408 :     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
   12629         1408 :     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
   12630         1408 :     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
   12631         1408 :     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
   12632         1408 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
   12633         1408 :     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
   12634         1408 :     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
   12635         1408 :     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
   12636         1408 :     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
   12637         1408 :     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
   12638         1408 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
   12639         1408 :     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
   12640         1408 :     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
   12641         1408 :     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
   12642         1408 :     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
   12643         1408 :     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
   12644         1408 :     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
   12645         1408 :       nargs = 4;
   12646         1408 :       second_arg_count = true;
   12647         1408 :       break;
   12648          967 :     case UINT64_FTYPE_UINT64_UINT64:
   12649          967 :     case UINT_FTYPE_UINT_UINT:
   12650          967 :     case UINT_FTYPE_UINT_USHORT:
   12651          967 :     case UINT_FTYPE_UINT_UCHAR:
   12652          967 :     case UINT16_FTYPE_UINT16_INT:
   12653          967 :     case UINT8_FTYPE_UINT8_INT:
   12654          967 :     case UQI_FTYPE_UQI_UQI:
   12655          967 :     case UHI_FTYPE_UHI_UHI:
   12656          967 :     case USI_FTYPE_USI_USI:
   12657          967 :     case UDI_FTYPE_UDI_UDI:
   12658          967 :     case V16SI_FTYPE_V8DF_V8DF:
   12659          967 :     case V32BF_FTYPE_V16SF_V16SF:
   12660          967 :     case V16BF_FTYPE_V8SF_V8SF:
   12661          967 :     case V8BF_FTYPE_V4SF_V4SF:
   12662          967 :     case V16BF_FTYPE_V16SF_UHI:
   12663          967 :     case V8BF_FTYPE_V8SF_UQI:
   12664          967 :     case V8BF_FTYPE_V4SF_UQI:
   12665          967 :     case V16QI_FTYPE_V16QI_V8HF:
   12666          967 :       nargs = 2;
   12667          967 :       break;
   12668          786 :     case V2DI_FTYPE_V2DI_INT_CONVERT:
   12669          786 :       nargs = 2;
   12670          786 :       rmode = V1TImode;
   12671          786 :       nargs_constant = 1;
   12672          786 :       break;
   12673           42 :     case V4DI_FTYPE_V4DI_INT_CONVERT:
   12674           42 :       nargs = 2;
   12675           42 :       rmode = V2TImode;
   12676           42 :       nargs_constant = 1;
   12677           42 :       break;
   12678           16 :     case V8DI_FTYPE_V8DI_INT_CONVERT:
   12679           16 :       nargs = 2;
   12680           16 :       rmode = V4TImode;
   12681           16 :       nargs_constant = 1;
   12682           16 :       break;
   12683         2424 :     case V8HI_FTYPE_V8HI_INT:
   12684         2424 :     case V8HI_FTYPE_V8SF_INT:
   12685         2424 :     case V16HI_FTYPE_V16SF_INT:
   12686         2424 :     case V8HI_FTYPE_V4SF_INT:
   12687         2424 :     case V8SF_FTYPE_V8SF_INT:
   12688         2424 :     case V4SF_FTYPE_V16SF_INT:
   12689         2424 :     case V16SF_FTYPE_V16SF_INT:
   12690         2424 :     case V4SI_FTYPE_V4SI_INT:
   12691         2424 :     case V4SI_FTYPE_V8SI_INT:
   12692         2424 :     case V4HI_FTYPE_V4HI_INT:
   12693         2424 :     case V4DF_FTYPE_V4DF_INT:
   12694         2424 :     case V4DF_FTYPE_V8DF_INT:
   12695         2424 :     case V4SF_FTYPE_V4SF_INT:
   12696         2424 :     case V4SF_FTYPE_V8SF_INT:
   12697         2424 :     case V2DI_FTYPE_V2DI_INT:
   12698         2424 :     case V2DF_FTYPE_V2DF_INT:
   12699         2424 :     case V2DF_FTYPE_V4DF_INT:
   12700         2424 :     case V16HI_FTYPE_V16HI_INT:
   12701         2424 :     case V8SI_FTYPE_V8SI_INT:
   12702         2424 :     case V16SI_FTYPE_V16SI_INT:
   12703         2424 :     case V4SI_FTYPE_V16SI_INT:
   12704         2424 :     case V4DI_FTYPE_V4DI_INT:
   12705         2424 :     case V2DI_FTYPE_V4DI_INT:
   12706         2424 :     case V4DI_FTYPE_V8DI_INT:
   12707         2424 :     case UQI_FTYPE_UQI_UQI_CONST:
   12708         2424 :     case UHI_FTYPE_UHI_UQI:
   12709         2424 :     case USI_FTYPE_USI_UQI:
   12710         2424 :     case UDI_FTYPE_UDI_UQI:
   12711         2424 :       nargs = 2;
   12712         2424 :       nargs_constant = 1;
   12713         2424 :       break;
   12714        18713 :     case V16QI_FTYPE_V16QI_V16QI_V16QI:
   12715        18713 :     case V8SF_FTYPE_V8SF_V8SF_V8SF:
   12716        18713 :     case V4DF_FTYPE_V4DF_V4DF_V4DF:
   12717        18713 :     case V4SF_FTYPE_V4SF_V4SF_V4SF:
   12718        18713 :     case V2DF_FTYPE_V2DF_V2DF_V2DF:
   12719        18713 :     case V32QI_FTYPE_V32QI_V32QI_V32QI:
   12720        18713 :     case UHI_FTYPE_V16SI_V16SI_UHI:
   12721        18713 :     case UQI_FTYPE_V8DI_V8DI_UQI:
   12722        18713 :     case V16HI_FTYPE_V16SI_V16HI_UHI:
   12723        18713 :     case V16QI_FTYPE_V16SI_V16QI_UHI:
   12724        18713 :     case V16QI_FTYPE_V8DI_V16QI_UQI:
   12725        18713 :     case V32HF_FTYPE_V32HF_V32HF_USI:
   12726        18713 :     case V16SF_FTYPE_V16SF_V16SF_UHI:
   12727        18713 :     case V16SF_FTYPE_V4SF_V16SF_UHI:
   12728        18713 :     case V16SI_FTYPE_SI_V16SI_UHI:
   12729        18713 :     case V16SI_FTYPE_V16HI_V16SI_UHI:
   12730        18713 :     case V16SI_FTYPE_V16QI_V16SI_UHI:
   12731        18713 :     case V8SF_FTYPE_V4SF_V8SF_UQI:
   12732        18713 :     case V4DF_FTYPE_V2DF_V4DF_UQI:
   12733        18713 :     case V8SI_FTYPE_V4SI_V8SI_UQI:
   12734        18713 :     case V8SI_FTYPE_SI_V8SI_UQI:
   12735        18713 :     case V4SI_FTYPE_V4SI_V4SI_UQI:
   12736        18713 :     case V4SI_FTYPE_SI_V4SI_UQI:
   12737        18713 :     case V4DI_FTYPE_V2DI_V4DI_UQI:
   12738        18713 :     case V4DI_FTYPE_DI_V4DI_UQI:
   12739        18713 :     case V2DI_FTYPE_V2DI_V2DI_UQI:
   12740        18713 :     case V2DI_FTYPE_DI_V2DI_UQI:
   12741        18713 :     case V64QI_FTYPE_V64QI_V64QI_UDI:
   12742        18713 :     case V64QI_FTYPE_V16QI_V64QI_UDI:
   12743        18713 :     case V64QI_FTYPE_QI_V64QI_UDI:
   12744        18713 :     case V32QI_FTYPE_V32QI_V32QI_USI:
   12745        18713 :     case V32QI_FTYPE_V16QI_V32QI_USI:
   12746        18713 :     case V32QI_FTYPE_QI_V32QI_USI:
   12747        18713 :     case V16QI_FTYPE_V16QI_V16QI_UHI:
   12748        18713 :     case V16QI_FTYPE_QI_V16QI_UHI:
   12749        18713 :     case V32HI_FTYPE_V8HI_V32HI_USI:
   12750        18713 :     case V32HI_FTYPE_V32BF_V32HI_USI:
   12751        18713 :     case V32HI_FTYPE_HI_V32HI_USI:
   12752        18713 :     case V16HI_FTYPE_V8HI_V16HI_UHI:
   12753        18713 :     case V16HI_FTYPE_V16BF_V16HI_UHI:
   12754        18713 :     case V16HI_FTYPE_HI_V16HI_UHI:
   12755        18713 :     case V8HI_FTYPE_V8HI_V8HI_UQI:
   12756        18713 :     case V8HI_FTYPE_V8BF_V8HI_UQI:
   12757        18713 :     case V8BF_FTYPE_V8BF_V8BF_UQI:
   12758        18713 :     case V8HI_FTYPE_HI_V8HI_UQI:
   12759        18713 :     case V16HF_FTYPE_V16HF_V16HF_UHI:
   12760        18713 :     case V8SF_FTYPE_V8HI_V8SF_UQI:
   12761        18713 :     case V4SF_FTYPE_V8HI_V4SF_UQI:
   12762        18713 :     case V8SI_FTYPE_V8HF_V8SI_UQI:
   12763        18713 :     case V8SF_FTYPE_V8HF_V8SF_UQI:
   12764        18713 :     case V8SI_FTYPE_V8SF_V8SI_UQI:
   12765        18713 :     case V4SI_FTYPE_V4SF_V4SI_UQI:
   12766        18713 :     case V4SI_FTYPE_V8HF_V4SI_UQI:
   12767        18713 :     case V4SF_FTYPE_V8HF_V4SF_UQI:
   12768        18713 :     case V4DI_FTYPE_V8HF_V4DI_UQI:
   12769        18713 :     case V4DI_FTYPE_V4SF_V4DI_UQI:
   12770        18713 :     case V2DI_FTYPE_V8HF_V2DI_UQI:
   12771        18713 :     case V2DI_FTYPE_V4SF_V2DI_UQI:
   12772        18713 :     case V8HF_FTYPE_V8HF_V8HF_UQI:
   12773        18713 :     case V8HF_FTYPE_V8HF_V8HF_V8HF:
   12774        18713 :     case V8HF_FTYPE_V8HI_V8HF_UQI:
   12775        18713 :     case V8HF_FTYPE_V8SI_V8HF_UQI:
   12776        18713 :     case V8HF_FTYPE_V8SF_V8HF_UQI:
   12777        18713 :     case V8HF_FTYPE_V4SI_V8HF_UQI:
   12778        18713 :     case V8HF_FTYPE_V4SF_V8HF_UQI:
   12779        18713 :     case V8HF_FTYPE_V4DI_V8HF_UQI:
   12780        18713 :     case V8HF_FTYPE_V4DF_V8HF_UQI:
   12781        18713 :     case V8HF_FTYPE_V2DI_V8HF_UQI:
   12782        18713 :     case V8HF_FTYPE_V2DF_V8HF_UQI:
   12783        18713 :     case V4SF_FTYPE_V4DI_V4SF_UQI:
   12784        18713 :     case V4SF_FTYPE_V2DI_V4SF_UQI:
   12785        18713 :     case V4DF_FTYPE_V4DI_V4DF_UQI:
   12786        18713 :     case V4DF_FTYPE_V8HF_V4DF_UQI:
   12787        18713 :     case V2DF_FTYPE_V8HF_V2DF_UQI:
   12788        18713 :     case V2DF_FTYPE_V2DI_V2DF_UQI:
   12789        18713 :     case V16QI_FTYPE_V8HI_V16QI_UQI:
   12790        18713 :     case V16QI_FTYPE_V16HI_V16QI_UHI:
   12791        18713 :     case V16QI_FTYPE_V4SI_V16QI_UQI:
   12792        18713 :     case V16QI_FTYPE_V8SI_V16QI_UQI:
   12793        18713 :     case V8HI_FTYPE_V8HF_V8HI_UQI:
   12794        18713 :     case V8HI_FTYPE_V4SI_V8HI_UQI:
   12795        18713 :     case V8HI_FTYPE_V8SI_V8HI_UQI:
   12796        18713 :     case V16QI_FTYPE_V2DI_V16QI_UQI:
   12797        18713 :     case V16QI_FTYPE_V4DI_V16QI_UQI:
   12798        18713 :     case V8HI_FTYPE_V2DI_V8HI_UQI:
   12799        18713 :     case V8HI_FTYPE_V4DI_V8HI_UQI:
   12800        18713 :     case V4SI_FTYPE_V2DI_V4SI_UQI:
   12801        18713 :     case V4SI_FTYPE_V4DI_V4SI_UQI:
   12802        18713 :     case V32QI_FTYPE_V32HI_V32QI_USI:
   12803        18713 :     case UHI_FTYPE_V16QI_V16QI_UHI:
   12804        18713 :     case USI_FTYPE_V32QI_V32QI_USI:
   12805        18713 :     case UDI_FTYPE_V64QI_V64QI_UDI:
   12806        18713 :     case UQI_FTYPE_V8HI_V8HI_UQI:
   12807        18713 :     case UHI_FTYPE_V16HI_V16HI_UHI:
   12808        18713 :     case USI_FTYPE_V32HI_V32HI_USI:
   12809        18713 :     case UQI_FTYPE_V4SI_V4SI_UQI:
   12810        18713 :     case UQI_FTYPE_V8SI_V8SI_UQI:
   12811        18713 :     case UQI_FTYPE_V2DI_V2DI_UQI:
   12812        18713 :     case UQI_FTYPE_V4DI_V4DI_UQI:
   12813        18713 :     case V4SF_FTYPE_V2DF_V4SF_UQI:
   12814        18713 :     case V4SF_FTYPE_V4DF_V4SF_UQI:
   12815        18713 :     case V16SI_FTYPE_V16SI_V16SI_UHI:
   12816        18713 :     case V16SI_FTYPE_V4SI_V16SI_UHI:
   12817        18713 :     case V2DI_FTYPE_V4SI_V2DI_UQI:
   12818        18713 :     case V2DI_FTYPE_V8HI_V2DI_UQI:
   12819        18713 :     case V2DI_FTYPE_V16QI_V2DI_UQI:
   12820        18713 :     case V4DI_FTYPE_V4DI_V4DI_UQI:
   12821        18713 :     case V4DI_FTYPE_V4SI_V4DI_UQI:
   12822        18713 :     case V4DI_FTYPE_V8HI_V4DI_UQI:
   12823        18713 :     case V4DI_FTYPE_V16QI_V4DI_UQI:
   12824        18713 :     case V4DI_FTYPE_V4DF_V4DI_UQI:
   12825        18713 :     case V2DI_FTYPE_V2DF_V2DI_UQI:
   12826        18713 :     case V4SI_FTYPE_V4DF_V4SI_UQI:
   12827        18713 :     case V4SI_FTYPE_V2DF_V4SI_UQI:
   12828        18713 :     case V4SI_FTYPE_V8HI_V4SI_UQI:
   12829        18713 :     case V4SI_FTYPE_V16QI_V4SI_UQI:
   12830        18713 :     case V4DI_FTYPE_V4DI_V4DI_V4DI:
   12831        18713 :     case V8DF_FTYPE_V2DF_V8DF_UQI:
   12832        18713 :     case V8DF_FTYPE_V4DF_V8DF_UQI:
   12833        18713 :     case V8DF_FTYPE_V8DF_V8DF_UQI:
   12834        18713 :     case V8SF_FTYPE_V8SF_V8SF_UQI:
   12835        18713 :     case V8SF_FTYPE_V8SI_V8SF_UQI:
   12836        18713 :     case V4DF_FTYPE_V4DF_V4DF_UQI:
   12837        18713 :     case V4SF_FTYPE_V4SF_V4SF_UQI:
   12838        18713 :     case V2DF_FTYPE_V2DF_V2DF_UQI:
   12839        18713 :     case V2DF_FTYPE_V4SF_V2DF_UQI:
   12840        18713 :     case V2DF_FTYPE_V4SI_V2DF_UQI:
   12841        18713 :     case V4SF_FTYPE_V4SI_V4SF_UQI:
   12842        18713 :     case V4DF_FTYPE_V4SF_V4DF_UQI:
   12843        18713 :     case V4DF_FTYPE_V4SI_V4DF_UQI:
   12844        18713 :     case V8SI_FTYPE_V8SI_V8SI_UQI:
   12845        18713 :     case V8SI_FTYPE_V8HI_V8SI_UQI:
   12846        18713 :     case V8SI_FTYPE_V16QI_V8SI_UQI:
   12847        18713 :     case V8DF_FTYPE_V8SI_V8DF_UQI:
   12848        18713 :     case V8DI_FTYPE_DI_V8DI_UQI:
   12849        18713 :     case V16SF_FTYPE_V8SF_V16SF_UHI:
   12850        18713 :     case V16SI_FTYPE_V8SI_V16SI_UHI:
   12851        18713 :     case V16HF_FTYPE_V16HI_V16HF_UHI:
   12852        18713 :     case V16HF_FTYPE_V16HF_V16HF_V16HF:
   12853        18713 :     case V16HI_FTYPE_V16HF_V16HI_UHI:
   12854        18713 :     case V16HI_FTYPE_V16HI_V16HI_UHI:
   12855        18713 :     case V16BF_FTYPE_V16BF_V16BF_UHI:
   12856        18713 :     case V8HI_FTYPE_V16QI_V8HI_UQI:
   12857        18713 :     case V16HI_FTYPE_V16QI_V16HI_UHI:
   12858        18713 :     case V32HI_FTYPE_V32HI_V32HI_USI:
   12859        18713 :     case V32BF_FTYPE_V32BF_V32BF_USI:
   12860        18713 :     case V32HI_FTYPE_V32QI_V32HI_USI:
   12861        18713 :     case V8DI_FTYPE_V16QI_V8DI_UQI:
   12862        18713 :     case V8DI_FTYPE_V2DI_V8DI_UQI:
   12863        18713 :     case V8DI_FTYPE_V4DI_V8DI_UQI:
   12864        18713 :     case V8DI_FTYPE_V8DI_V8DI_UQI:
   12865        18713 :     case V8DI_FTYPE_V8HI_V8DI_UQI:
   12866        18713 :     case V8DI_FTYPE_V8SI_V8DI_UQI:
   12867        18713 :     case V8HI_FTYPE_V8DI_V8HI_UQI:
   12868        18713 :     case V8SI_FTYPE_V8DI_V8SI_UQI:
   12869        18713 :     case V4SI_FTYPE_V4SI_V4SI_V4SI:
   12870        18713 :     case V4DI_FTYPE_V4DI_V4DI_V2DI:
   12871        18713 :     case V16SI_FTYPE_V16SI_V16SI_V16SI:
   12872        18713 :     case V8DI_FTYPE_V8DI_V8DI_V8DI:
   12873        18713 :     case V32HI_FTYPE_V32HI_V32HI_V32HI:
   12874        18713 :     case V2DI_FTYPE_V2DI_V2DI_V2DI:
   12875        18713 :     case V16HI_FTYPE_V16HI_V16HI_V16HI:
   12876        18713 :     case V8SI_FTYPE_V8SI_V8SI_V8SI:
   12877        18713 :     case V8HI_FTYPE_V8HI_V8HI_V8HI:
   12878        18713 :     case V32BF_FTYPE_V16SF_V16SF_USI:
   12879        18713 :     case V16BF_FTYPE_V8SF_V8SF_UHI:
   12880        18713 :     case V8BF_FTYPE_V4SF_V4SF_UQI:
   12881        18713 :     case V16BF_FTYPE_V16SF_V16BF_UHI:
   12882        18713 :     case V8BF_FTYPE_V8SF_V8BF_UQI:
   12883        18713 :     case V8BF_FTYPE_V4SF_V8BF_UQI:
   12884        18713 :     case V16SF_FTYPE_V16SF_V32BF_V32BF:
   12885        18713 :     case V8SF_FTYPE_V8SF_V16BF_V16BF:
   12886        18713 :     case V4SF_FTYPE_V4SF_V8BF_V8BF:
   12887        18713 :     case V16QI_FTYPE_V16QI_V8HF_V8HF:
   12888        18713 :     case V32QI_FTYPE_V32QI_V16HF_V16HF:
   12889        18713 :     case V64QI_FTYPE_V64QI_V32HF_V32HF:
   12890        18713 :     case V16QI_FTYPE_V8HF_V16QI_UQI:
   12891        18713 :     case V16QI_FTYPE_V16HF_V16QI_UHI:
   12892        18713 :     case V32QI_FTYPE_V32HF_V32QI_USI:
   12893        18713 :     case V8HF_FTYPE_V16QI_V8HF_UQI:
   12894        18713 :     case V16HF_FTYPE_V16QI_V16HF_UHI:
   12895        18713 :     case V32HF_FTYPE_V32QI_V32HF_USI:
   12896        18713 :     case V16SI_FTYPE_V16SF_V16SI_UHI:
   12897        18713 :     case V32HI_FTYPE_V32HF_V32HI_USI:
   12898        18713 :     case V8DI_FTYPE_V8SF_V8DI_UQI:
   12899        18713 :     case V8DI_FTYPE_V8DF_V8DI_UQI:
   12900        18713 :     case V8SI_FTYPE_V8DF_V8SI_UQI:
   12901        18713 :       nargs = 3;
   12902        18713 :       break;
   12903         1481 :     case V32QI_FTYPE_V32QI_V32QI_INT:
   12904         1481 :     case V16HI_FTYPE_V16HI_V16HI_INT:
   12905         1481 :     case V16QI_FTYPE_V16QI_V16QI_INT:
   12906         1481 :     case V4DI_FTYPE_V4DI_V4DI_INT:
   12907         1481 :     case V8HI_FTYPE_V8HI_V8HI_INT:
   12908         1481 :     case V8SI_FTYPE_V8SI_V8SI_INT:
   12909         1481 :     case V8SI_FTYPE_V8SI_V4SI_INT:
   12910         1481 :     case V8SF_FTYPE_V8SF_V8SF_INT:
   12911         1481 :     case V8SF_FTYPE_V8SF_V4SF_INT:
   12912         1481 :     case V4SI_FTYPE_V4SI_V4SI_INT:
   12913         1481 :     case V4DF_FTYPE_V4DF_V4DF_INT:
   12914         1481 :     case V16SF_FTYPE_V16SF_V16SF_INT:
   12915         1481 :     case V16SF_FTYPE_V16SF_V4SF_INT:
   12916         1481 :     case V16SI_FTYPE_V16SI_V4SI_INT:
   12917         1481 :     case V4DF_FTYPE_V4DF_V2DF_INT:
   12918         1481 :     case V4SF_FTYPE_V4SF_V4SF_INT:
   12919         1481 :     case V2DI_FTYPE_V2DI_V2DI_INT:
   12920         1481 :     case V4DI_FTYPE_V4DI_V2DI_INT:
   12921         1481 :     case V2DF_FTYPE_V2DF_V2DF_INT:
   12922         1481 :     case UQI_FTYPE_V8DI_V8UDI_INT:
   12923         1481 :     case UQI_FTYPE_V8DF_V8DF_INT:
   12924         1481 :     case UQI_FTYPE_V2DF_V2DF_INT:
   12925         1481 :     case UQI_FTYPE_V4SF_V4SF_INT:
   12926         1481 :     case UHI_FTYPE_V16SI_V16SI_INT:
   12927         1481 :     case UHI_FTYPE_V16SF_V16SF_INT:
   12928         1481 :     case V64QI_FTYPE_V64QI_V64QI_INT:
   12929         1481 :     case V32HI_FTYPE_V32HI_V32HI_INT:
   12930         1481 :     case V16SI_FTYPE_V16SI_V16SI_INT:
   12931         1481 :     case V8DI_FTYPE_V8DI_V8DI_INT:
   12932         1481 :       nargs = 3;
   12933         1481 :       nargs_constant = 1;
   12934         1481 :       break;
   12935           47 :     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
   12936           47 :       nargs = 3;
   12937           47 :       rmode = V4DImode;
   12938           47 :       nargs_constant = 1;
   12939           47 :       break;
   12940           80 :     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
   12941           80 :       nargs = 3;
   12942           80 :       rmode = V2DImode;
   12943           80 :       nargs_constant = 1;
   12944           80 :       break;
   12945           48 :     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
   12946           48 :       nargs = 3;
   12947           48 :       rmode = DImode;
   12948           48 :       nargs_constant = 1;
   12949           48 :       break;
   12950           20 :     case V2DI_FTYPE_V2DI_UINT_UINT:
   12951           20 :       nargs = 3;
   12952           20 :       nargs_constant = 2;
   12953           20 :       break;
   12954            8 :     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
   12955            8 :       nargs = 3;
   12956            8 :       rmode = V8DImode;
   12957            8 :       nargs_constant = 1;
   12958            8 :       break;
   12959           16 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
   12960           16 :       nargs = 5;
   12961           16 :       rmode = V8DImode;
   12962           16 :       mask_pos = 2;
   12963           16 :       nargs_constant = 1;
   12964           16 :       break;
   12965          320 :     case QI_FTYPE_V8DF_INT_UQI:
   12966          320 :     case QI_FTYPE_V4DF_INT_UQI:
   12967          320 :     case QI_FTYPE_V2DF_INT_UQI:
   12968          320 :     case HI_FTYPE_V16SF_INT_UHI:
   12969          320 :     case QI_FTYPE_V8SF_INT_UQI:
   12970          320 :     case QI_FTYPE_V4SF_INT_UQI:
   12971          320 :     case QI_FTYPE_V8HF_INT_UQI:
   12972          320 :     case HI_FTYPE_V16HF_INT_UHI:
   12973          320 :     case SI_FTYPE_V32HF_INT_USI:
   12974          320 :     case QI_FTYPE_V8BF_INT_UQI:
   12975          320 :     case HI_FTYPE_V16BF_INT_UHI:
   12976          320 :     case SI_FTYPE_V32BF_INT_USI:
   12977          320 :     case V4SI_FTYPE_V4SI_V4SI_UHI:
   12978          320 :     case V8SI_FTYPE_V8SI_V8SI_UHI:
   12979          320 :       nargs = 3;
   12980          320 :       mask_pos = 1;
   12981          320 :       nargs_constant = 1;
   12982          320 :       break;
   12983           17 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
   12984           17 :       nargs = 5;
   12985           17 :       rmode = V4DImode;
   12986           17 :       mask_pos = 2;
   12987           17 :       nargs_constant = 1;
   12988           17 :       break;
   12989           17 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
   12990           17 :       nargs = 5;
   12991           17 :       rmode = V2DImode;
   12992           17 :       mask_pos = 2;
   12993           17 :       nargs_constant = 1;
   12994           17 :       break;
   12995        17264 :     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
   12996        17264 :     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
   12997        17264 :     case V32BF_FTYPE_V32BF_V32BF_V32BF_USI:
   12998        17264 :     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
   12999        17264 :     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
   13000        17264 :     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
   13001        17264 :     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
   13002        17264 :     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
   13003        17264 :     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
   13004        17264 :     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
   13005        17264 :     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
   13006        17264 :     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
   13007        17264 :     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
   13008        17264 :     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
   13009        17264 :     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
   13010        17264 :     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
   13011        17264 :     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
   13012        17264 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
   13013        17264 :     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
   13014        17264 :     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
   13015        17264 :     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
   13016        17264 :     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
   13017        17264 :     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
   13018        17264 :     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
   13019        17264 :     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
   13020        17264 :     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
   13021        17264 :     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
   13022        17264 :     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
   13023        17264 :     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
   13024        17264 :     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
   13025        17264 :     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
   13026        17264 :     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
   13027        17264 :     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
   13028        17264 :     case V8BF_FTYPE_V8BF_V8BF_V8BF_UQI:
   13029        17264 :     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
   13030        17264 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
   13031        17264 :     case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
   13032        17264 :     case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
   13033        17264 :     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
   13034        17264 :     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
   13035        17264 :     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
   13036        17264 :     case V16BF_FTYPE_V16BF_V16BF_V16BF_UHI:
   13037        17264 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
   13038        17264 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
   13039        17264 :     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
   13040        17264 :     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
   13041        17264 :     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
   13042        17264 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
   13043        17264 :     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
   13044        17264 :     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
   13045        17264 :     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
   13046        17264 :     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
   13047        17264 :     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
   13048        17264 :     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
   13049        17264 :     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
   13050        17264 :     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
   13051        17264 :     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
   13052        17264 :     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
   13053        17264 :     case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
   13054        17264 :     case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
   13055        17264 :     case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
   13056        17264 :     case V32HF_FTYPE_V16SF_V16SF_V32HF_USI:
   13057        17264 :     case V16HF_FTYPE_V8SF_V8SF_V16HF_UHI:
   13058        17264 :     case V8HF_FTYPE_V4SF_V4SF_V8HF_UQI:
   13059        17264 :     case V16QI_FTYPE_V8HF_V8HF_V16QI_UHI:
   13060        17264 :     case V32QI_FTYPE_V16HF_V16HF_V32QI_USI:
   13061        17264 :     case V64QI_FTYPE_V32HF_V32HF_V64QI_UDI:
   13062        17264 :     case V16QI_FTYPE_V16QI_V8HF_V16QI_UHI:
   13063        17264 :     case V16QI_FTYPE_V32QI_V16HF_V16QI_UHI:
   13064        17264 :     case V32QI_FTYPE_V64QI_V32HF_V32QI_USI:
   13065        17264 :       nargs = 4;
   13066        17264 :       break;
   13067           11 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
   13068           11 :     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
   13069           11 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
   13070           11 :     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
   13071           11 :     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
   13072           11 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
   13073           11 :       nargs = 4;
   13074           11 :       nargs_constant = 1;
   13075           11 :       break;
   13076         3718 :     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
   13077         3718 :     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
   13078         3718 :     case QI_FTYPE_V4DF_V4DF_INT_UQI:
   13079         3718 :     case QI_FTYPE_V8SF_V8SF_INT_UQI:
   13080         3718 :     case UHI_FTYPE_V16HF_V16HF_INT_UHI:
   13081         3718 :     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
   13082         3718 :     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
   13083         3718 :     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
   13084         3718 :     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
   13085         3718 :     case UQI_FTYPE_V8HF_V8HF_INT_UQI:
   13086         3718 :     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
   13087         3718 :     case USI_FTYPE_V32QI_V32QI_INT_USI:
   13088         3718 :     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
   13089         3718 :     case USI_FTYPE_V32HI_V32HI_INT_USI:
   13090         3718 :     case USI_FTYPE_V32BF_V32BF_INT_USI:
   13091         3718 :     case USI_FTYPE_V32HF_V32HF_INT_USI:
   13092         3718 :     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
   13093         3718 :     case UHI_FTYPE_V16BF_V16BF_INT_UHI:
   13094         3718 :     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
   13095         3718 :     case UQI_FTYPE_V8BF_V8BF_INT_UQI:
   13096         3718 :       nargs = 4;
   13097         3718 :       mask_pos = 1;
   13098         3718 :       nargs_constant = 1;
   13099         3718 :       break;
   13100           23 :     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
   13101           23 :       nargs = 4;
   13102           23 :       nargs_constant = 2;
   13103           23 :       break;
   13104           67 :     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
   13105           67 :     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
   13106           67 :     case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
   13107           67 :     case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
   13108           67 :     case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
   13109           67 :       nargs = 4;
   13110           67 :       break;
   13111          679 :     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
   13112          679 :     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
   13113          679 :       mask_pos = 1;
   13114          679 :       nargs = 4;
   13115          679 :       nargs_constant = 1;
   13116          679 :       break;
   13117         3948 :     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
   13118         3948 :     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
   13119         3948 :     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
   13120         3948 :     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
   13121         3948 :     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
   13122         3948 :     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
   13123         3948 :     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
   13124         3948 :     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
   13125         3948 :     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
   13126         3948 :     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
   13127         3948 :     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
   13128         3948 :     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
   13129         3948 :     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
   13130         3948 :     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
   13131         3948 :     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
   13132         3948 :     case V32BF_FTYPE_V32BF_INT_V32BF_USI:
   13133         3948 :     case V16BF_FTYPE_V16BF_INT_V16BF_UHI:
   13134         3948 :     case V8BF_FTYPE_V8BF_INT_V8BF_UQI:
   13135         3948 :     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
   13136         3948 :     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
   13137         3948 :     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
   13138         3948 :     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
   13139         3948 :     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
   13140         3948 :     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
   13141         3948 :     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
   13142         3948 :     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
   13143         3948 :     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
   13144         3948 :     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
   13145         3948 :     case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
   13146         3948 :     case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
   13147         3948 :     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
   13148         3948 :     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
   13149         3948 :     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
   13150         3948 :     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
   13151         3948 :     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
   13152         3948 :       nargs = 4;
   13153         3948 :       mask_pos = 2;
   13154         3948 :       nargs_constant = 1;
   13155         3948 :       break;
   13156         1726 :     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
   13157         1726 :     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
   13158         1726 :     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
   13159         1726 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
   13160         1726 :     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
   13161         1726 :     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
   13162         1726 :     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
   13163         1726 :     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
   13164         1726 :     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
   13165         1726 :     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
   13166         1726 :     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
   13167         1726 :     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
   13168         1726 :     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
   13169         1726 :     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
   13170         1726 :     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
   13171         1726 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
   13172         1726 :     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
   13173         1726 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
   13174         1726 :     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
   13175         1726 :     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
   13176         1726 :     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
   13177         1726 :     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
   13178         1726 :     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
   13179         1726 :     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
   13180         1726 :     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
   13181         1726 :     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
   13182         1726 :     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
   13183         1726 :       nargs = 5;
   13184         1726 :       mask_pos = 2;
   13185         1726 :       nargs_constant = 1;
   13186         1726 :       break;
   13187          268 :     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
   13188          268 :     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
   13189          268 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
   13190          268 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
   13191          268 :     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
   13192          268 :     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
   13193          268 :     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
   13194          268 :     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
   13195          268 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
   13196          268 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
   13197          268 :       nargs = 5;
   13198          268 :       mask_pos = 1;
   13199          268 :       nargs_constant = 1;
   13200          268 :       break;
   13201          732 :     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
   13202          732 :     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
   13203          732 :     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
   13204          732 :     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
   13205          732 :     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
   13206          732 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
   13207          732 :     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
   13208          732 :     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
   13209          732 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
   13210          732 :     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
   13211          732 :     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
   13212          732 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
   13213          732 :     case V8BF_FTYPE_V8BF_V8BF_INT_V8BF_UQI:
   13214          732 :     case V16BF_FTYPE_V16BF_V16BF_INT_V16BF_UHI:
   13215          732 :     case V32BF_FTYPE_V32BF_V32BF_INT_V32BF_USI:
   13216          732 :     case V16HF_FTYPE_V16HF_V16HF_INT_V16HF_UHI:
   13217          732 :     case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI:
   13218          732 :       nargs = 5;
   13219          732 :       mask_pos = 1;
   13220          732 :       nargs_constant = 2;
   13221          732 :       break;
   13222              : 
   13223            0 :     default:
   13224            0 :       gcc_unreachable ();
   13225              :     }
   13226              : 
   13227        56327 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   13228              : 
   13229        61704 :   if (comparison != UNKNOWN)
   13230              :     {
   13231          614 :       gcc_assert (nargs == 2);
   13232          614 :       return ix86_expand_sse_compare (d, exp, target, swap);
   13233              :     }
   13234              : 
   13235        61090 :   if (rmode == VOIDmode || rmode == tmode)
   13236              :     {
   13237        60905 :       if (optimize
   13238        17726 :           || target == 0
   13239        17726 :           || GET_MODE (target) != tmode
   13240        78429 :           || !insn_p->operand[0].predicate (target, tmode))
   13241        43469 :         target = gen_reg_rtx (tmode);
   13242        17436 :       else if (memory_operand (target, tmode))
   13243          578 :         num_memory++;
   13244              :       real_target = target;
   13245              :     }
   13246              :   else
   13247              :     {
   13248          185 :       real_target = gen_reg_rtx (tmode);
   13249          185 :       target = lowpart_subreg (rmode, real_target, tmode);
   13250              :     }
   13251              : 
   13252       261387 :   for (i = 0; i < nargs; i++)
   13253              :     {
   13254       200530 :       tree arg = CALL_EXPR_ARG (exp, i);
   13255       200530 :       rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
   13256       200530 :       machine_mode mode = insn_p->operand[i + 1].mode;
   13257              :       /* Need to fixup modeless constant before testing predicate.  */
   13258       200530 :       op = fixup_modeless_constant (op, mode);
   13259       200530 :       bool match = insn_p->operand[i + 1].predicate (op, mode);
   13260              : 
   13261       200530 :       if (second_arg_count && i == 1)
   13262              :         {
   13263              :           /* SIMD shift insns take either an 8-bit immediate or
   13264              :              register as count.  But builtin functions take int as
   13265              :              count.  If count doesn't match, we put it in register.
   13266              :              The instructions are using 64-bit count, if op is just
   13267              :              32-bit, zero-extend it, as negative shift counts
   13268              :              are undefined behavior and zero-extension is more
   13269              :              efficient.  */
   13270         2889 :           if (!match)
   13271              :             {
   13272         1750 :               if (SCALAR_INT_MODE_P (GET_MODE (op)))
   13273          489 :                 op = convert_modes (mode, GET_MODE (op), op, 1);
   13274              :               else
   13275         1261 :                 op = lowpart_subreg (mode, op, GET_MODE (op));
   13276         1750 :               if (!insn_p->operand[i + 1].predicate (op, mode))
   13277          190 :                 op = copy_to_reg (op);
   13278              :             }
   13279              :         }
   13280       197641 :       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   13281       149585 :                (!mask_pos && (nargs - i) <= nargs_constant))
   13282              :         {
   13283        16466 :           if (!match)
   13284          233 :             switch (icode)
   13285              :               {
   13286            2 :               case CODE_FOR_avx_vinsertf128v4di:
   13287            2 :               case CODE_FOR_avx_vextractf128v4di:
   13288            2 :                 error ("the last argument must be an 1-bit immediate");
   13289            2 :                 return const0_rtx;
   13290              : 
   13291            8 :               case CODE_FOR_avx512f_cmpv8di3_mask:
   13292            8 :               case CODE_FOR_avx512f_cmpv16si3_mask:
   13293            8 :               case CODE_FOR_avx512f_ucmpv8di3_mask:
   13294            8 :               case CODE_FOR_avx512f_ucmpv16si3_mask:
   13295            8 :               case CODE_FOR_avx512vl_cmpv4di3_mask:
   13296            8 :               case CODE_FOR_avx512vl_cmpv8si3_mask:
   13297            8 :               case CODE_FOR_avx512vl_ucmpv4di3_mask:
   13298            8 :               case CODE_FOR_avx512vl_ucmpv8si3_mask:
   13299            8 :               case CODE_FOR_avx512vl_cmpv2di3_mask:
   13300            8 :               case CODE_FOR_avx512vl_cmpv4si3_mask:
   13301            8 :               case CODE_FOR_avx512vl_ucmpv2di3_mask:
   13302            8 :               case CODE_FOR_avx512vl_ucmpv4si3_mask:
   13303            8 :                 error ("the last argument must be a 3-bit immediate");
   13304            8 :                 return const0_rtx;
   13305              : 
   13306           24 :               case CODE_FOR_sse4_1_roundsd:
   13307           24 :               case CODE_FOR_sse4_1_roundss:
   13308              : 
   13309           24 :               case CODE_FOR_sse4_1_roundpd:
   13310           24 :               case CODE_FOR_sse4_1_roundps:
   13311           24 :               case CODE_FOR_avx_roundpd256:
   13312           24 :               case CODE_FOR_avx_roundps256:
   13313              : 
   13314           24 :               case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
   13315           24 :               case CODE_FOR_sse4_1_roundps_sfix:
   13316           24 :               case CODE_FOR_avx_roundpd_vec_pack_sfix256:
   13317           24 :               case CODE_FOR_avx_roundps_sfix256:
   13318              : 
   13319           24 :               case CODE_FOR_sse4_1_blendps:
   13320           24 :               case CODE_FOR_avx_blendpd256:
   13321           24 :               case CODE_FOR_avx_vpermilv4df:
   13322           24 :               case CODE_FOR_avx_vpermilv4df_mask:
   13323           24 :               case CODE_FOR_avx512f_getmantv8df_mask:
   13324           24 :               case CODE_FOR_avx512f_getmantv16sf_mask:
   13325           24 :               case CODE_FOR_avx512vl_getmantv16hf_mask:
   13326           24 :               case CODE_FOR_avx512vl_getmantv8sf_mask:
   13327           24 :               case CODE_FOR_avx512vl_getmantv4df_mask:
   13328           24 :               case CODE_FOR_avx512fp16_getmantv8hf_mask:
   13329           24 :               case CODE_FOR_avx512vl_getmantv4sf_mask:
   13330           24 :               case CODE_FOR_avx512vl_getmantv2df_mask:
   13331           24 :               case CODE_FOR_avx512dq_rangepv8df_mask_round:
   13332           24 :               case CODE_FOR_avx512dq_rangepv16sf_mask_round:
   13333           24 :               case CODE_FOR_avx512dq_rangepv4df_mask:
   13334           24 :               case CODE_FOR_avx512dq_rangepv8sf_mask:
   13335           24 :               case CODE_FOR_avx512dq_rangepv2df_mask:
   13336           24 :               case CODE_FOR_avx512dq_rangepv4sf_mask:
   13337           24 :               case CODE_FOR_avx_shufpd256_mask:
   13338           24 :                 error ("the last argument must be a 4-bit immediate");
   13339           24 :                 return const0_rtx;
   13340              : 
   13341           15 :               case CODE_FOR_sha1rnds4:
   13342           15 :               case CODE_FOR_sse4_1_blendpd:
   13343           15 :               case CODE_FOR_avx_vpermilv2df:
   13344           15 :               case CODE_FOR_avx_vpermilv2df_mask:
   13345           15 :               case CODE_FOR_xop_vpermil2v2df3:
   13346           15 :               case CODE_FOR_xop_vpermil2v4sf3:
   13347           15 :               case CODE_FOR_xop_vpermil2v4df3:
   13348           15 :               case CODE_FOR_xop_vpermil2v8sf3:
   13349           15 :               case CODE_FOR_avx512f_vinsertf32x4_mask:
   13350           15 :               case CODE_FOR_avx512f_vinserti32x4_mask:
   13351           15 :               case CODE_FOR_avx512f_vextractf32x4_mask:
   13352           15 :               case CODE_FOR_avx512f_vextracti32x4_mask:
   13353           15 :               case CODE_FOR_sse2_shufpd:
   13354           15 :               case CODE_FOR_sse2_shufpd_mask:
   13355           15 :               case CODE_FOR_avx512dq_shuf_f64x2_mask:
   13356           15 :               case CODE_FOR_avx512dq_shuf_i64x2_mask:
   13357           15 :               case CODE_FOR_avx512vl_shuf_i32x4_mask:
   13358           15 :               case CODE_FOR_avx512vl_shuf_f32x4_mask:
   13359           15 :                 error ("the last argument must be a 2-bit immediate");
   13360           15 :                 return const0_rtx;
   13361              : 
   13362           30 :               case CODE_FOR_avx_vextractf128v4df:
   13363           30 :               case CODE_FOR_avx_vextractf128v8sf:
   13364           30 :               case CODE_FOR_avx_vextractf128v8si:
   13365           30 :               case CODE_FOR_avx_vinsertf128v4df:
   13366           30 :               case CODE_FOR_avx_vinsertf128v8sf:
   13367           30 :               case CODE_FOR_avx_vinsertf128v8si:
   13368           30 :               case CODE_FOR_avx512f_vinsertf64x4_mask:
   13369           30 :               case CODE_FOR_avx512f_vinserti64x4_mask:
   13370           30 :               case CODE_FOR_avx512f_vextractf64x4_mask:
   13371           30 :               case CODE_FOR_avx512f_vextracti64x4_mask:
   13372           30 :               case CODE_FOR_avx512dq_vinsertf32x8_mask:
   13373           30 :               case CODE_FOR_avx512dq_vinserti32x8_mask:
   13374           30 :               case CODE_FOR_avx512vl_vinsertv4df:
   13375           30 :               case CODE_FOR_avx512vl_vinsertv4di:
   13376           30 :               case CODE_FOR_avx512vl_vinsertv8sf:
   13377           30 :               case CODE_FOR_avx512vl_vinsertv8si:
   13378           30 :                 error ("the last argument must be a 1-bit immediate");
   13379           30 :                 return const0_rtx;
   13380              : 
   13381           16 :               case CODE_FOR_avx_vmcmpv2df3:
   13382           16 :               case CODE_FOR_avx_vmcmpv4sf3:
   13383           16 :               case CODE_FOR_avx_cmpv2df3:
   13384           16 :               case CODE_FOR_avx_cmpv4sf3:
   13385           16 :                 if (CONST_INT_P (op) && IN_RANGE (INTVAL (op), 8, 31))
   13386              :                   {
   13387            4 :                     error ("'%s' needs isa option %s", d->name, "-mavx");
   13388            4 :                     return const0_rtx;
   13389              :                   }
   13390              :                 /* FALLTHRU */
   13391           18 :               case CODE_FOR_avx_cmpv4df3:
   13392           18 :               case CODE_FOR_avx_cmpv8sf3:
   13393           18 :               case CODE_FOR_avx512f_cmpv8df3_mask:
   13394           18 :               case CODE_FOR_avx512f_cmpv16sf3_mask:
   13395           18 :               case CODE_FOR_avx512f_vmcmpv2df3_mask:
   13396           18 :               case CODE_FOR_avx512f_vmcmpv4sf3_mask:
   13397           18 :               case CODE_FOR_avx512bw_cmpv32hf3_mask:
   13398           18 :               case CODE_FOR_avx512vl_cmpv16hf3_mask:
   13399           18 :               case CODE_FOR_avx512fp16_cmpv8hf3_mask:
   13400           18 :                 error ("the last argument must be a 5-bit immediate");
   13401           18 :                 return const0_rtx;
   13402              : 
   13403          132 :               default:
   13404          132 :                 switch (nargs_constant)
   13405              :                   {
   13406            8 :                   case 2:
   13407            8 :                     if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   13408            8 :                         (!mask_pos && (nargs - i) == nargs_constant))
   13409              :                       {
   13410            4 :                         error ("the next to last argument must be an 8-bit immediate");
   13411            4 :                         break;
   13412              :                       }
   13413              :                     /* FALLTHRU */
   13414          128 :                   case 1:
   13415          128 :                     error ("the last argument must be an 8-bit immediate");
   13416          128 :                     break;
   13417            0 :                   default:
   13418            0 :                     gcc_unreachable ();
   13419              :                   }
   13420          132 :                 return const0_rtx;
   13421              :               }
   13422              :         }
   13423              :       else
   13424              :         {
   13425       181175 :           if (VECTOR_MODE_P (mode))
   13426       130554 :             op = safe_vector_operand (op, mode);
   13427              : 
   13428              :           /* If we aren't optimizing, only allow one memory operand to
   13429              :              be generated.  */
   13430       181175 :           if (memory_operand (op, mode))
   13431              :             {
   13432        29863 :               num_memory++;
   13433        29863 :               if (!optimize && num_memory > 1)
   13434        13602 :                 op = copy_to_mode_reg (mode, op);
   13435              :             }
   13436              : 
   13437       181175 :           if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   13438              :             {
   13439       178885 :               if (!match)
   13440        42558 :                 op = copy_to_mode_reg (mode, op);
   13441              :             }
   13442              :           else
   13443              :             {
   13444         2290 :               op = copy_to_reg (op);
   13445         2290 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   13446              :             }
   13447              :         }
   13448              : 
   13449       200297 :       xops[i] = op;
   13450              :     }
   13451              : 
   13452        60857 :   switch (nargs)
   13453              :     {
   13454         4763 :     case 1:
   13455         4763 :       pat = GEN_FCN (icode) (real_target, xops[0]);
   13456         4763 :       break;
   13457         5663 :     case 2:
   13458         5663 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
   13459         5663 :       break;
   13460        20627 :     case 3:
   13461        20627 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
   13462        20627 :       break;
   13463        27064 :     case 4:
   13464        27064 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13465        27064 :                              xops[2], xops[3]);
   13466        27064 :       break;
   13467         2740 :     case 5:
   13468         2740 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13469         2740 :                              xops[2], xops[3], xops[4]);
   13470         2740 :       break;
   13471              :     case 6:
   13472              :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13473              :                              xops[2], xops[3], xops[4], xops[5]);
   13474              :       break;
   13475              :     default:
   13476              :       gcc_unreachable ();
   13477              :     }
   13478              : 
   13479        60857 :   if (! pat)
   13480              :     return 0;
   13481              : 
   13482        60857 :   emit_insn (pat);
   13483        60857 :   return target;
   13484              : }
   13485              : 
    13486              : /* PAT is an insn or a bare SET of the following layout:
    13487              :      (set A
    13488              :        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
    13489              :    Return a new SET keeping only the first unspec element (C is dropped):
    13490              :      (set A B)
    13491              :    Any other shape of PAT trips one of the gcc_asserts below.  */
    13492              : 
    13493              : static rtx
    13494         4944 : ix86_erase_embedded_rounding (rtx pat)
    13495              : {
    13496         4944 :   if (NONJUMP_INSN_P (pat))  /* Given a whole insn, work on its pattern.  */
    13497          694 :     pat = PATTERN (pat);
    13498              : 
    13499         4944 :   gcc_assert (GET_CODE (pat) == SET);  /* Only a plain SET is handled.  */
    13500         4944 :   rtx src = SET_SRC (pat);
    13501         4944 :   gcc_assert (XVECLEN (src, 0) == 2);  /* NOTE(review): vector length is read before the UNSPEC code check below.  */
    13502         4944 :   rtx p0 = XVECEXP (src, 0, 0);  /* First unspec element (B in the layout above).  */
    13503         4944 :   gcc_assert (GET_CODE (src) == UNSPEC
    13504              :               && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
    13505         4944 :   rtx res = gen_rtx_SET (SET_DEST (pat), p0);  /* Same destination A, source replaced by B.  */
    13506         4944 :   return res;
    13507              : }
   13508              : 
   13509              : /* Subroutine of ix86_expand_round_builtin to take care of comi insns
   13510              :    with rounding.  D supplies the insn code; EXP is the call, whose four arguments are the two compare operands, the comparison constant and the rounding operand; COMX_OK allows the AVX10.2 COMX/UCOMX insn codes to be used.  Returns the result rtx, const0_rtx after a diagnostic, or 0 when no pattern is generated.  */
   13511              : static rtx
   13512          103 : ix86_expand_sse_comi_round (const struct builtin_description *d,
   13513              :                             tree exp, rtx target, bool comx_ok)
   13514              : {
   13515          103 :   rtx pat, set_dst;
   13516          103 :   tree arg0 = CALL_EXPR_ARG (exp, 0);  /* First compare operand.  */
   13517          103 :   tree arg1 = CALL_EXPR_ARG (exp, 1);  /* Second compare operand.  */
   13518          103 :   tree arg2 = CALL_EXPR_ARG (exp, 2);  /* Comparison constant, validated below.  */
   13519          103 :   tree arg3 = CALL_EXPR_ARG (exp, 3);  /* Rounding operand, validated below.  */
   13520          103 :   rtx op0 = expand_normal (arg0);
   13521          103 :   rtx op1 = expand_normal (arg1);
   13522          103 :   rtx op2 = expand_normal (arg2);
   13523          103 :   rtx op3 = expand_normal (arg3);
   13524          103 :   enum insn_code icode = d->icode;
   13525          103 :   const struct insn_data_d *insn_p = &insn_data[icode];
   13526          103 :   machine_mode mode0 = insn_p->operand[0].mode;
   13527          103 :   machine_mode mode1 = insn_p->operand[1].mode;
   13528              :
   13529              :   /* Tables indexed by the comparison constant (0..31).  See avxintrin.h for values.  */
   13530          103 :   static const enum rtx_code comparisons[32] =
   13531              :     {
   13532              :       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
   13533              :       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
   13534              :       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
   13535              :       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
   13536              :     };
   13537          103 :   static const bool ordereds[32] =  /* Per-constant "ordered" attribute.  */
   13538              :     {
   13539              :       true,  true,  true,  false, false, false, false, true,
   13540              :       false, false, false, true,  true,  true,  true,  false,
   13541              :       true,  true,  true,  false, false, false, false, true,
   13542              :       false, false, false, true,  true,  true,  true,  false
   13543              :     };
   13544          103 :   static const bool non_signalings[32] =  /* Per-constant "non-signaling" attribute.  */
   13545              :     {
   13546              :       true,  false, false, true,  true,  false, false, true,
   13547              :       true,  false, false, true,  true,  false, false, true,
   13548              :       false, true,  true,  false, false, true,  true,  false,
   13549              :       false, true,  true,  false, false, true,  true,  false
   13550              :     };
   13551              :
   13552          103 :   if (!CONST_INT_P (op2))
   13553              :     {
   13554            0 :       error ("the third argument must be comparison constant");
   13555            0 :       return const0_rtx;
   13556              :     }
   13557          103 :   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)  /* Must index the 32-entry tables.  */
   13558              :     {
   13559            0 :       error ("incorrect comparison mode");
   13560            0 :       return const0_rtx;
   13561              :     }
   13562              :
   13563          103 :   if (!insn_p->operand[2].predicate (op3, SImode))
   13564              :     {
   13565            4 :       error ("incorrect rounding operand");
   13566            4 :       return const0_rtx;
   13567              :     }
   13568              :
   13569           99 :   if (VECTOR_MODE_P (mode0))
   13570           99 :     op0 = safe_vector_operand (op0, mode0);
   13571           99 :   if (VECTOR_MODE_P (mode1))
   13572           99 :     op1 = safe_vector_operand (op1, mode1);
   13573              :
   13574           99 :   enum rtx_code comparison = comparisons[INTVAL (op2)];
   13575           99 :   enum rtx_code orig_comp = comparison;  /* Saved because COMPARISON is rewritten below.  */
   13576           99 :   bool ordered = ordereds[INTVAL (op2)];
   13577           99 :   bool non_signaling = non_signalings[INTVAL (op2)];
   13578           99 :   rtx const_val = const0_rtx;  /* Initial value moved into the result register.  */
   13579              :
   13580           99 :   bool check_unordered = false;  /* Forwarded to ix86_ssecom_setcc.  */
   13581           99 :   machine_mode mode = CCFPmode;  /* Condition-code mode forwarded to ix86_ssecom_setcc.  */
   13582           99 :   switch (comparison)
   13583              :     {
   13584            8 :     case ORDERED:
   13585            8 :       if (!ordered)
   13586              :         {
   13587            4 :           if (TARGET_AVX10_2 && comx_ok)
   13588              :             {
   13589              :               /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
   13590              :                  differently. So directly return true here.  */
   13591            0 :               target = gen_reg_rtx (SImode);
   13592            0 :               emit_move_insn (target, const1_rtx);
   13593            0 :               return target;
   13594              :             }
   13595              :           else
   13596              :             {
   13597              :               /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
   13598              :               if (!non_signaling)
   13599           99 :                 ordered = true;
   13600           99 :               mode = CCSmode;
   13601              :             }
   13602              :         }
   13603              :       else
   13604              :         {
   13605              :           /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
   13606              :           if (non_signaling)
   13607              :             ordered = false;
   13608              :           mode = CCPmode;
   13609              :         }
   13610              :       comparison = NE;
   13611              :       break;
   13612            8 :     case UNORDERED:
   13613            8 :       if (ordered)
   13614              :         {
   13615            4 :           if (TARGET_AVX10_2 && comx_ok)
   13616              :             {
   13617              :               /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
   13618              :                  differently. So directly return false here.  */
   13619            0 :               target = gen_reg_rtx (SImode);
   13620            0 :               emit_move_insn (target, const0_rtx);
   13621            0 :               return target;
   13622              :             }
   13623              :           else
   13624              :             {
   13625              :               /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
   13626              :               if (non_signaling)
   13627           99 :                 ordered = false;
   13628              :               mode = CCSmode;
   13629              :             }
   13630              :         }
   13631              :       else
   13632              :         {
   13633              :           /* NB: Use CCPmode/EQ for _CMP_UNORD_Q/_CMP_UNORD_S.  */
   13634              :           if (!non_signaling)
   13635           99 :             ordered = true;
   13636           99 :           mode = CCPmode;
   13637              :         }
   13638              :       comparison = EQ;
   13639              :       break;
   13640              :
   13641           40 :     case LE:    /* -> GE  */
   13642           40 :     case LT:    /* -> GT  */
   13643           40 :     case UNGE:  /* -> UNLE  */
   13644           40 :     case UNGT:  /* -> UNLT  */
   13645           40 :       std::swap (op0, op1);
   13646           40 :       comparison = swap_condition (comparison);
   13647              :       /* FALLTHRU */
   13648           68 :     case GT:
   13649           68 :     case GE:
   13650           68 :     case UNEQ:
   13651           68 :     case UNLT:
   13652           68 :     case UNLE:
   13653           68 :     case LTGT:
   13654              :       /* These are supported by CCFPmode.  NB: Use ordered/signaling
   13655              :          COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
   13656              :          with NAN operands.  */
   13657           68 :       if (ordered == non_signaling)
   13658              :         ordered = !ordered;
   13659              :       break;
   13660              :       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
   13661              :          _CMP_EQ_OQ/_CMP_EQ_OS.
   13662              :          Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
   13663              :          of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
   13664            8 :     case EQ:
   13665            8 :       if (!TARGET_AVX10_2 || !comx_ok)
   13666            5 :         check_unordered = true;
   13667              :       mode = CCZmode;
   13668              :       break;
   13669            7 :     case NE:
   13670              :       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
   13671              :          _CMP_NEQ_UQ/_CMP_NEQ_US.
   13672              :          Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
   13673              :          of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
   13674            7 :       gcc_assert (!ordered);
   13675            7 :       if (!TARGET_AVX10_2 || !comx_ok)
   13676            4 :         check_unordered = true;
   13677            7 :       mode = CCZmode;
   13678            7 :       const_val = const1_rtx;
   13679            7 :       break;
   13680            0 :     default:
   13681            0 :       gcc_unreachable ();
   13682              :     }
   13683              :
   13684           99 :   target = gen_reg_rtx (SImode);
   13685           99 :   emit_move_insn (target, const_val);
   13686           99 :   target = gen_rtx_SUBREG (QImode, target, 0);  /* From here on TARGET is the QImode subreg at byte 0 of that pseudo.  */
   13687              :
   13688           93 :   if ((optimize && !register_operand (op0, mode0))
   13689          192 :       || !insn_p->operand[0].predicate (op0, mode0))
   13690            6 :     op0 = copy_to_mode_reg (mode0, op0);
   13691           93 :   if ((optimize && !register_operand (op1, mode1))
   13692          192 :       || !insn_p->operand[1].predicate (op1, mode1))
   13693            6 :     op1 = copy_to_mode_reg (mode1, op1);
   13694              :
   13695              :     /* Generate comx instead of comi when EQ/NE to avoid NAN checks.
   13696              :        Use orig_comp to exclude ORDERED/UNORDERED cases.  */
   13697           99 :   if ((orig_comp == EQ || orig_comp == NE)
   13698           15 :       && TARGET_AVX10_2 && comx_ok)
   13699              :     {
   13700            6 :       switch (icode)
   13701              :         {
   13702              :         case CODE_FOR_avx512fp16_comi_round:
   13703           99 :           icode = CODE_FOR_avx10_2_comxhf_round;
   13704              :           break;
   13705            4 :         case CODE_FOR_sse_comi_round:
   13706            4 :           icode = CODE_FOR_avx10_2_comxsf_round;
   13707            4 :           break;
   13708            2 :         case CODE_FOR_sse2_comi_round:
   13709            2 :           icode = CODE_FOR_avx10_2_comxdf_round;
   13710            2 :           break;
   13711              :
   13712              :         default:
   13713              :           break;
   13714              :         }
   13715              :     }
   13716              :
   13717              :   /* Generate comi instead of comx when UNEQ/LTGT to avoid NAN checks.  */
   13718           99 :   if ((comparison == UNEQ || comparison == LTGT)
   13719            8 :        && TARGET_AVX10_2 && comx_ok)
   13720              :     {
   13721            0 :       switch (icode)
   13722              :         {
   13723              :         case CODE_FOR_avx10_2_comxhf_round:
   13724           99 :           icode = CODE_FOR_avx512fp16_comi_round;
   13725              :           break;
   13726            0 :         case CODE_FOR_avx10_2_comxsf_round:
   13727            0 :           icode = CODE_FOR_sse_comi_round;
   13728            0 :           break;
   13729            0 :         case CODE_FOR_avx10_2_comxdf_round:
   13730            0 :           icode = CODE_FOR_sse2_comi_round;
   13731            0 :           break;
   13732              :
   13733              :         default:
   13734              :           break;
   13735              :         }
   13736              :     }
   13737              :
   13738              :   /*
   13739              :      1. COMI/VCOMX: ordered and signaling.
   13740              :      2. UCOMI/VUCOMX: unordered and non-signaling.
   13741              :    */
   13742           99 :   if (non_signaling)  /* Switch to the matching ucomi/ucomx insn code.  */
   13743           38 :     switch (icode)
   13744              :       {
   13745              :       case CODE_FOR_sse_comi_round:
   13746              :         icode = CODE_FOR_sse_ucomi_round;
   13747              :         break;
   13748           17 :       case CODE_FOR_sse2_comi_round:
   13749           17 :         icode = CODE_FOR_sse2_ucomi_round;
   13750           17 :         break;
   13751            0 :       case CODE_FOR_avx512fp16_comi_round:
   13752            0 :         icode = CODE_FOR_avx512fp16_ucomi_round;
   13753            0 :         break;
   13754            3 :       case CODE_FOR_avx10_2_comxsf_round:
   13755            3 :         icode = CODE_FOR_avx10_2_ucomxsf_round;
   13756            3 :         break;
   13757            0 :       case CODE_FOR_avx10_2_comxhf_round:
   13758            0 :         icode = CODE_FOR_avx10_2_ucomxhf_round;
   13759            0 :         break;
   13760            1 :       case CODE_FOR_avx10_2_comxdf_round:
   13761            1 :         icode = CODE_FOR_avx10_2_ucomxdf_round;
   13762            1 :         break;
   13763            0 :       default:
   13764            0 :         gcc_unreachable ();
   13765              :       }
   13766              :
   13767           99 :   pat = GEN_FCN (icode) (op0, op1, op3);
   13768           99 :   if (! pat)
   13769              :     return 0;
   13770              :
   13771              :   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
   13772           99 :   if (INTVAL (op3) == NO_ROUND)
   13773              :     {
   13774            1 :       pat = ix86_erase_embedded_rounding (pat);  /* Strip the UNSPEC_EMBEDDED_ROUNDING wrapper.  */
   13775            1 :       if (! pat)
   13776              :         return 0;
   13777              :
   13778            1 :       set_dst = SET_DEST (pat);
   13779              :     }
   13780              :   else
   13781              :     {
   13782           98 :       gcc_assert (GET_CODE (pat) == SET);
   13783           98 :       set_dst = SET_DEST (pat);
   13784              :     }
   13785              :
   13786           99 :   emit_insn (pat);
   13787              :
   13788           99 :   return ix86_ssecom_setcc (comparison, check_unordered, mode,
   13789           99 :                             set_dst, target);
   13790              : }
   13791              : /* Expand the builtin described by D for call EXP, whose last argument is a rounding operand.  Returns TARGET (possibly a fresh pseudo), const0_rtx after a diagnostic, or 0 when no pattern is generated.  */
   13792              : static rtx
   13793        15589 : ix86_expand_round_builtin (const struct builtin_description *d,
   13794              :                            tree exp, rtx target)
   13795              : {
   13796        15589 :   rtx pat;
   13797        15589 :   unsigned int i, nargs;
   13798        15589 :   rtx xops[6];
   13799        15589 :   enum insn_code icode = d->icode;
   13800        15589 :   const struct insn_data_d *insn_p = &insn_data[icode];
   13801        15589 :   machine_mode tmode = insn_p->operand[0].mode;
   13802        15589 :   unsigned int nargs_constant = 0;  /* When nonzero, argument nargs - nargs_constant is the immediate.  */
   13803        15589 :   unsigned int redundant_embed_rnd = 0;  /* Set when the embedded-rounding wrapper is to be erased.  */
   13804              :
   13805        15589 :   switch ((enum ix86_builtin_func_type) d->flag)  /* Derive argument count and immediate position from the function type.  */
   13806              :     {
   13807              :     case UINT64_FTYPE_V2DF_INT:
   13808              :     case UINT64_FTYPE_V4SF_INT:
   13809              :     case UINT64_FTYPE_V8HF_INT:
   13810              :     case UINT_FTYPE_V2DF_INT:
   13811              :     case UINT_FTYPE_V4SF_INT:
   13812              :     case UINT_FTYPE_V8HF_INT:
   13813              :     case INT64_FTYPE_V2DF_INT:
   13814              :     case INT64_FTYPE_V4SF_INT:
   13815              :     case INT64_FTYPE_V8HF_INT:
   13816              :     case INT_FTYPE_V2DF_INT:
   13817              :     case INT_FTYPE_V4SF_INT:
   13818              :     case INT_FTYPE_V8HF_INT:
   13819              :       nargs = 2;
   13820              :       break;
   13821          651 :     case V32HF_FTYPE_V32HF_V32HF_INT:
   13822          651 :     case V8HF_FTYPE_V8HF_V8HF_INT:
   13823          651 :     case V8HF_FTYPE_V8HF_INT_INT:
   13824          651 :     case V8HF_FTYPE_V8HF_UINT_INT:
   13825          651 :     case V8HF_FTYPE_V8HF_INT64_INT:
   13826          651 :     case V8HF_FTYPE_V8HF_UINT64_INT:
   13827          651 :     case V4SF_FTYPE_V4SF_UINT_INT:
   13828          651 :     case V4SF_FTYPE_V4SF_UINT64_INT:
   13829          651 :     case V2DF_FTYPE_V2DF_UINT64_INT:
   13830          651 :     case V4SF_FTYPE_V4SF_INT_INT:
   13831          651 :     case V4SF_FTYPE_V4SF_INT64_INT:
   13832          651 :     case V2DF_FTYPE_V2DF_INT64_INT:
   13833          651 :     case V4SF_FTYPE_V4SF_V4SF_INT:
   13834          651 :     case V2DF_FTYPE_V2DF_V2DF_INT:
   13835          651 :     case V4SF_FTYPE_V4SF_V2DF_INT:
   13836          651 :     case V2DF_FTYPE_V2DF_V4SF_INT:
   13837          651 :       nargs = 3;
   13838          651 :       break;
   13839         4554 :     case V8SF_FTYPE_V8DF_V8SF_QI_INT:
   13840         4554 :     case V8DF_FTYPE_V8DF_V8DF_QI_INT:
   13841         4554 :     case V32HI_FTYPE_V32HF_V32HI_USI_INT:
   13842         4554 :     case V32HI_FTYPE_V32BF_V32HI_USI_INT:
   13843         4554 :     case V8SI_FTYPE_V8DF_V8SI_QI_INT:
   13844         4554 :     case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
   13845         4554 :     case V8DI_FTYPE_V8DF_V8DI_QI_INT:
   13846         4554 :     case V8SF_FTYPE_V8DI_V8SF_QI_INT:
   13847         4554 :     case V8DF_FTYPE_V8DI_V8DF_QI_INT:
   13848         4554 :     case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
   13849         4554 :     case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
   13850         4554 :     case V32HF_FTYPE_V32HI_V32HF_USI_INT:
   13851         4554 :     case V32HF_FTYPE_V32HF_V32HF_USI_INT:
   13852         4554 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
   13853         4554 :     case V16SF_FTYPE_V16SF_V16SF_HI_INT:
   13854         4554 :     case V8DI_FTYPE_V8SF_V8DI_QI_INT:
   13855         4554 :     case V16SF_FTYPE_V16SI_V16SF_HI_INT:
   13856         4554 :     case V16SI_FTYPE_V16SF_V16SI_HI_INT:
   13857         4554 :     case V16SI_FTYPE_V16SF_V16SI_UHI_INT:
   13858         4554 :     case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
   13859         4554 :     case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
   13860         4554 :     case V8DF_FTYPE_V8SF_V8DF_QI_INT:
   13861         4554 :     case V16SF_FTYPE_V16HI_V16SF_HI_INT:
   13862         4554 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
   13863         4554 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
   13864         4554 :     case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
   13865         4554 :     case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
   13866         4554 :     case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
   13867         4554 :     case V16HI_FTYPE_V16BF_V16HI_UHI_INT:
   13868         4554 :     case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
   13869         4554 :       nargs = 4;
   13870         4554 :       break;
   13871          180 :     case V4SF_FTYPE_V4SF_V4SF_INT_INT:
   13872          180 :     case V2DF_FTYPE_V2DF_V2DF_INT_INT:
   13873          180 :       nargs_constant = 2;
   13874          180 :       nargs = 4;
   13875          180 :       break;
   13876          103 :     case INT_FTYPE_V4SF_V4SF_INT_INT:
   13877          103 :     case INT_FTYPE_V2DF_V2DF_INT_INT:
   13878          103 :       return ix86_expand_sse_comi_round (d, exp, target, true);  /* Comi builtins are expanded by their own routine.  */
   13879         6233 :     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
   13880         6233 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
   13881         6233 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
   13882         6233 :     case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
   13883         6233 :     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
   13884         6233 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
   13885         6233 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
   13886         6233 :     case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
   13887         6233 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
   13888         6233 :     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
   13889         6233 :     case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
   13890         6233 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
   13891         6233 :     case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
   13892         6233 :     case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
   13893         6233 :     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
   13894         6233 :     case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
   13895         6233 :     case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
   13896         6233 :     case V32HF_FTYPE_V16SF_V16SF_V32HF_USI_INT:
   13897         6233 :       nargs = 5;
   13898         6233 :       break;
   13899          635 :     case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
   13900          635 :     case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
   13901          635 :     case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
   13902          635 :     case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
   13903          635 :     case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
   13904          635 :       nargs_constant = 4;
   13905          635 :       nargs = 5;
   13906          635 :       break;
   13907         1181 :     case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
   13908         1181 :     case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
   13909         1181 :     case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
   13910         1181 :     case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
   13911         1181 :     case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
   13912         1181 :     case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
   13913         1181 :       nargs_constant = 3;
   13914         1181 :       nargs = 5;
   13915         1181 :       break;
   13916         1071 :     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
   13917         1071 :     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
   13918         1071 :     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
   13919         1071 :     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
   13920         1071 :     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
   13921         1071 :     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
   13922         1071 :     case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
   13923         1071 :     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI_INT:
   13924         1071 :     case V32HF_FTYPE_V32HF_V32HF_INT_V32HF_USI_INT:
   13925         1071 :     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI_INT:
   13926         1071 :       nargs = 6;
   13927         1071 :       nargs_constant = 4;
   13928         1071 :       break;
   13929          252 :     case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
   13930          252 :     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
   13931          252 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
   13932          252 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
   13933          252 :       nargs = 6;
   13934          252 :       nargs_constant = 3;
   13935          252 :       break;
   13936            0 :     default:
   13937            0 :       gcc_unreachable ();
   13938              :     }
   13939        14757 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   13940              :
   13941        15486 :   if (optimize  /* Use a fresh pseudo unless TARGET can be used directly.  */
   13942         4265 :       || target == 0
   13943         4265 :       || GET_MODE (target) != tmode
   13944        19751 :       || !insn_p->operand[0].predicate (target, tmode))
   13945        11221 :     target = gen_reg_rtx (tmode);
   13946              :
   13947        85365 :   for (i = 0; i < nargs; i++)
   13948              :     {
   13949        70434 :       tree arg = CALL_EXPR_ARG (exp, i);
   13950        70434 :       rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
   13951        70434 :       machine_mode mode = insn_p->operand[i + 1].mode;  /* Insn operand 0 is the destination, so argument I is operand I + 1.  */
   13952        70434 :       bool match = insn_p->operand[i + 1].predicate (op, mode);
   13953              :
   13954        70434 :       if (i == nargs - nargs_constant)  /* The immediate argument: diagnose it if the predicate rejects it.  */
   13955              :         {
   13956         3319 :           if (!match)
   13957              :             {
   13958           40 :               switch (icode)
   13959              :                 {
   13960           12 :                 case CODE_FOR_avx512f_getmantv8df_mask_round:
   13961           12 :                 case CODE_FOR_avx512f_getmantv16sf_mask_round:
   13962           12 :                 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
   13963           12 :                 case CODE_FOR_avx512f_vgetmantv2df_round:
   13964           12 :                 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
   13965           12 :                 case CODE_FOR_avx512f_vgetmantv4sf_round:
   13966           12 :                 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
   13967           12 :                 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
   13968           12 :                   error ("the immediate argument must be a 4-bit immediate");
   13969           12 :                   return const0_rtx;
   13970            8 :                 case CODE_FOR_avx512f_cmpv8df3_mask_round:
   13971            8 :                 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
   13972            8 :                 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
   13973            8 :                 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
   13974            8 :                 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
   13975            8 :                 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
   13976            8 :                   error ("the immediate argument must be a 5-bit immediate");
   13977            8 :                   return const0_rtx;
   13978           20 :                 default:
   13979           20 :                   error ("the immediate argument must be an 8-bit immediate");
   13980           20 :                   return const0_rtx;
   13981              :                 }
   13982              :             }
   13983              :         }
   13984        67115 :       else if (i == nargs-1)  /* The last argument is the rounding operand.  */
   13985              :         {
   13986        15446 :           if (!insn_p->operand[nargs].predicate (op, SImode))
   13987              :             {
   13988          515 :               error ("incorrect rounding operand");
   13989          515 :               return const0_rtx;
   13990              :             }
   13991              :
   13992              :           /* If there is no rounding use normal version of the pattern.  */
   13993        14931 :           if (INTVAL (op) == NO_ROUND)
   13994              :             {
   13995              :               /* Skip erasing embedded rounding for the expanders below,
   13996              :                  which generate multiple insns.  ix86_erase_embedded_rounding
   13997              :                  reduces the pattern to a single set, and emit_insn appends
   13998              :                  that set instead of inserting it into the chain, so the insns
   13999              :                  emitted inside the define_expander would be ignored.  */
   14000         4975 :               switch (icode)
   14001              :                 {
   14002              :                 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
   14003              :                 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
   14004              :                 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
   14005              :                 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
   14006              :                 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
   14007              :                 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
   14008              :                   redundant_embed_rnd = 0;
   14009              :                   break;
   14010         4943 :                 default:
   14011         4943 :                   redundant_embed_rnd = 1;
   14012         4943 :                   break;
   14013              :                 }
   14014              :             }
   14015              :         }
   14016              :       else  /* Any other argument: coerce it to the operand's mode.  */
   14017              :         {
   14018        51669 :           if (VECTOR_MODE_P (mode))
   14019        37752 :             op = safe_vector_operand (op, mode);
   14020              :
   14021        51669 :           op = fixup_modeless_constant (op, mode);
   14022              :
   14023        51669 :           if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   14024              :             {
   14025        51669 :               if (optimize || !match)
   14026        45341 :                 op = copy_to_mode_reg (mode, op);
   14027              :             }
   14028              :           else
   14029              :             {
   14030            0 :               op = copy_to_reg (op);
   14031            0 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   14032              :             }
   14033              :         }
   14034              :
   14035        69879 :       xops[i] = op;
   14036              :     }
   14037              :
   14038        14931 :   switch (nargs)  /* Generate the pattern from TARGET followed by the NARGS operands.  */
   14039              :     {
   14040              :     case 1:  /* No function type handled above sets NARGS to 1.  */
   14041              :       pat = GEN_FCN (icode) (target, xops[0]);
   14042              :       break;
   14043          696 :     case 2:
   14044          696 :       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
   14045          696 :       break;
   14046          607 :     case 3:
   14047          607 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
   14048          607 :       break;
   14049         4610 :     case 4:
   14050         4610 :       pat = GEN_FCN (icode) (target, xops[0], xops[1],
   14051         4610 :                              xops[2], xops[3]);
   14052         4610 :       break;
   14053         7745 :     case 5:
   14054         7745 :       pat = GEN_FCN (icode) (target, xops[0], xops[1],
   14055         7745 :                              xops[2], xops[3], xops[4]);
   14056         7745 :       break;
   14057         1273 :     case 6:
   14058         1273 :       pat = GEN_FCN (icode) (target, xops[0], xops[1],
   14059         1273 :                              xops[2], xops[3], xops[4], xops[5]);
   14060         1273 :       break;
   14061              :     default:
   14062              :       gcc_unreachable ();
   14063              :     }
   14064              :
   14065        14931 :   if (!pat)
   14066              :     return 0;
   14067              :
   14068        14931 :   if (redundant_embed_rnd)
   14069         4943 :     pat = ix86_erase_embedded_rounding (pat);  /* NO_ROUND: strip the UNSPEC_EMBEDDED_ROUNDING wrapper.  */
   14070              :
   14071        14931 :   emit_insn (pat);
   14072        14931 :   return target;
   14073              : }
   14074              : 
   14075              : /* Subroutine of ix86_expand_builtin to take care of special insns
   14076              :    with variable number of operands.
                      :
                      :    D supplies the insn code (d->icode) and the function-type tag
                      :    (d->flag); the tag selects the number of arguments, whether the
                      :    builtin is handled as a load or a store, and which operand (if
                      :    any) is a memory reference.  EXP is the CALL_EXPR whose arguments
                      :    are expanded.  TARGET is a suggested result location; for the
                      :    store class it must be 0 on entry (asserted below) and is replaced
                      :    by a destination built from argument 0.
                      :
                      :    Returns TARGET for the load class.  Returns 0 for the store class,
                      :    for VOID_FTYPE_VOID, and when the generator function yields no
                      :    pattern.  Returns const0_rtx after reporting an error for an
                      :    immediate argument rejected by the insn's operand predicate.  */
   14077              : 
   14078              : static rtx
   14079        27184 : ix86_expand_special_args_builtin (const struct builtin_description *d,
   14080              :                                   tree exp, rtx target)
   14081              : {
   14082        27184 :   tree arg;
   14083        27184 :   rtx pat, op;
   14084        27184 :   unsigned int i, nargs, arg_adjust, memory;
   14085        27184 :   unsigned int constant = 100;  /* Index of the immediate in xops; 100 matches no loop index.  */
   14086        27184 :   bool aligned_mem = false;
   14087        27184 :   rtx xops[4];
   14088        27184 :   enum insn_code icode = d->icode;
   14089        27184 :   const struct insn_data_d *insn_p = &insn_data[icode];
   14090        27184 :   machine_mode tmode = insn_p->operand[0].mode;
   14091        27184 :   enum { load, store } klass;
   14092              : 
   14093        27184 :   switch ((enum ix86_builtin_func_type) d->flag)
   14094              :     {
   14095        15371 :     case VOID_FTYPE_VOID:
   14096        15371 :       emit_insn (GEN_FCN (icode) (target));
   14097        15371 :       return 0;
   14098              :     case VOID_FTYPE_UINT64:
   14099              :     case VOID_FTYPE_UNSIGNED:
   14100              :       nargs = 0;
   14101              :       klass = store;
   14102              :       memory = 0;
   14103              :       break;
   14104              : 
   14105         7581 :     case INT_FTYPE_VOID:
   14106         7581 :     case USHORT_FTYPE_VOID:
   14107         7581 :     case UINT64_FTYPE_VOID:
   14108         7581 :     case UINT_FTYPE_VOID:
   14109         7581 :     case UINT8_FTYPE_VOID:
   14110         7581 :     case UNSIGNED_FTYPE_VOID:
   14111         7581 :       nargs = 0;
   14112         7581 :       klass = load;
   14113         7581 :       memory = 0;
   14114         7581 :       break;
   14115          359 :     case CHAR_FTYPE_PCCHAR:
   14116          359 :     case SHORT_FTYPE_PCSHORT:
   14117          359 :     case INT_FTYPE_PCINT:
   14118          359 :     case INT64_FTYPE_PCINT64:
   14119          359 :     case UINT64_FTYPE_PUNSIGNED:
   14120          359 :     case V2DI_FTYPE_PV2DI:
   14121          359 :     case V4DI_FTYPE_PV4DI:
   14122          359 :     case V32QI_FTYPE_PCCHAR:
   14123          359 :     case V16QI_FTYPE_PCCHAR:
   14124          359 :     case V8SF_FTYPE_PCV4SF:
   14125          359 :     case V8SF_FTYPE_PCFLOAT:
   14126          359 :     case V4SF_FTYPE_PCFLOAT:
   14127          359 :     case V4SF_FTYPE_PCFLOAT16:
   14128          359 :     case V4SF_FTYPE_PCBFLOAT16:
   14129          359 :     case V4SF_FTYPE_PCV8BF:
   14130          359 :     case V4SF_FTYPE_PCV8HF:
   14131          359 :     case V8SF_FTYPE_PCFLOAT16:
   14132          359 :     case V8SF_FTYPE_PCBFLOAT16:
   14133          359 :     case V8SF_FTYPE_PCV16HF:
   14134          359 :     case V8SF_FTYPE_PCV16BF:
   14135          359 :     case V4DF_FTYPE_PCV2DF:
   14136          359 :     case V4DF_FTYPE_PCDOUBLE:
   14137          359 :     case V2DF_FTYPE_PCDOUBLE:
   14138          359 :     case VOID_FTYPE_PVOID:
   14139          359 :     case V8DI_FTYPE_PV8DI:
   14140          359 :       nargs = 1;
   14141          359 :       klass = load;
   14142          359 :       memory = 0;
   14143          359 :       switch (icode)
   14144              :         {
   14145              :         case CODE_FOR_sse4_1_movntdqa:
   14146              :         case CODE_FOR_avx2_movntdqa:
   14147              :         case CODE_FOR_avx512f_movntdqa:
   14148              :           aligned_mem = true;
   14149              :           break;
   14150              :         default:
   14151              :           break;
   14152              :         }
   14153              :       break;
   14154          371 :     case VOID_FTYPE_PV2SF_V4SF:
   14155          371 :     case VOID_FTYPE_PV8DI_V8DI:
   14156          371 :     case VOID_FTYPE_PV4DI_V4DI:
   14157          371 :     case VOID_FTYPE_PV2DI_V2DI:
   14158          371 :     case VOID_FTYPE_PCHAR_V32QI:
   14159          371 :     case VOID_FTYPE_PCHAR_V16QI:
   14160          371 :     case VOID_FTYPE_PFLOAT_V16SF:
   14161          371 :     case VOID_FTYPE_PFLOAT_V8SF:
   14162          371 :     case VOID_FTYPE_PFLOAT_V4SF:
   14163          371 :     case VOID_FTYPE_PDOUBLE_V8DF:
   14164          371 :     case VOID_FTYPE_PDOUBLE_V4DF:
   14165          371 :     case VOID_FTYPE_PDOUBLE_V2DF:
   14166          371 :     case VOID_FTYPE_PLONGLONG_LONGLONG:
   14167          371 :     case VOID_FTYPE_PULONGLONG_ULONGLONG:
   14168          371 :     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
   14169          371 :     case VOID_FTYPE_PINT_INT:
   14170          371 :       nargs = 1;
   14171          371 :       klass = store;
   14172              :       /* Reserve memory operand for target.  */
   14173          371 :       memory = ARRAY_SIZE (xops);
   14174          371 :       switch (icode)
   14175              :         {
   14176              :         /* These builtins and instructions require the memory
   14177              :            to be properly aligned.  */
   14178              :         case CODE_FOR_avx_movntv4di:
   14179              :         case CODE_FOR_sse2_movntv2di:
   14180              :         case CODE_FOR_avx_movntv8sf:
   14181              :         case CODE_FOR_sse_movntv4sf:
   14182              :         case CODE_FOR_sse4a_vmmovntv4sf:
   14183              :         case CODE_FOR_avx_movntv4df:
   14184              :         case CODE_FOR_sse2_movntv2df:
   14185              :         case CODE_FOR_sse4a_vmmovntv2df:
   14186              :         case CODE_FOR_sse2_movntidi:
   14187              :         case CODE_FOR_sse_movntq:
   14188              :         case CODE_FOR_sse2_movntisi:
   14189              :         case CODE_FOR_avx512f_movntv16sf:
   14190              :         case CODE_FOR_avx512f_movntv8df:
   14191              :         case CODE_FOR_avx512f_movntv8di:
   14192              :           aligned_mem = true;
   14193              :           break;
   14194              :         default:
   14195              :           break;
   14196              :         }
   14197              :       break;
   14198            0 :     case VOID_FTYPE_PVOID_PCVOID:
   14199            0 :         nargs = 1;
   14200            0 :         klass = store;
   14201            0 :         memory = 0;
   14202              : 
   14203            0 :         break;
   14204           26 :     case V4SF_FTYPE_V4SF_PCV2SF:
   14205           26 :     case V2DF_FTYPE_V2DF_PCDOUBLE:
   14206           26 :       nargs = 2;
   14207           26 :       klass = load;
   14208           26 :       memory = 1;
   14209           26 :       break;
   14210           93 :     case V8SF_FTYPE_PCV8SF_V8SI:
   14211           93 :     case V4DF_FTYPE_PCV4DF_V4DI:
   14212           93 :     case V4SF_FTYPE_PCV4SF_V4SI:
   14213           93 :     case V2DF_FTYPE_PCV2DF_V2DI:
   14214           93 :     case V8SI_FTYPE_PCV8SI_V8SI:
   14215           93 :     case V4DI_FTYPE_PCV4DI_V4DI:
   14216           93 :     case V4SI_FTYPE_PCV4SI_V4SI:
   14217           93 :     case V2DI_FTYPE_PCV2DI_V2DI:
   14218           93 :     case VOID_FTYPE_INT_INT64:
   14219           93 :       nargs = 2;
   14220           93 :       klass = load;
   14221           93 :       memory = 0;
   14222           93 :       break;
   14223          360 :     case VOID_FTYPE_PV8DF_V8DF_UQI:
   14224          360 :     case VOID_FTYPE_PV4DF_V4DF_UQI:
   14225          360 :     case VOID_FTYPE_PV2DF_V2DF_UQI:
   14226          360 :     case VOID_FTYPE_PV16SF_V16SF_UHI:
   14227          360 :     case VOID_FTYPE_PV8SF_V8SF_UQI:
   14228          360 :     case VOID_FTYPE_PV4SF_V4SF_UQI:
   14229          360 :     case VOID_FTYPE_PV8DI_V8DI_UQI:
   14230          360 :     case VOID_FTYPE_PV4DI_V4DI_UQI:
   14231          360 :     case VOID_FTYPE_PV2DI_V2DI_UQI:
   14232          360 :     case VOID_FTYPE_PV16SI_V16SI_UHI:
   14233          360 :     case VOID_FTYPE_PV8SI_V8SI_UQI:
   14234          360 :     case VOID_FTYPE_PV4SI_V4SI_UQI:
   14235          360 :     case VOID_FTYPE_PV64QI_V64QI_UDI:
   14236          360 :     case VOID_FTYPE_PV32HI_V32HI_USI:
   14237          360 :     case VOID_FTYPE_PV32QI_V32QI_USI:
   14238          360 :     case VOID_FTYPE_PV16QI_V16QI_UHI:
   14239          360 :     case VOID_FTYPE_PV16HI_V16HI_UHI:
   14240          360 :     case VOID_FTYPE_PV8HI_V8HI_UQI:
   14241          360 :       switch (icode)
   14242              :         {
   14243              :         /* These builtins and instructions require the memory
   14244              :            to be properly aligned.  */
   14245              :         case CODE_FOR_avx512f_storev16sf_mask:
   14246              :         case CODE_FOR_avx512f_storev16si_mask:
   14247              :         case CODE_FOR_avx512f_storev8df_mask:
   14248              :         case CODE_FOR_avx512f_storev8di_mask:
   14249              :         case CODE_FOR_avx512vl_storev8sf_mask:
   14250              :         case CODE_FOR_avx512vl_storev8si_mask:
   14251              :         case CODE_FOR_avx512vl_storev4df_mask:
   14252              :         case CODE_FOR_avx512vl_storev4di_mask:
   14253              :         case CODE_FOR_avx512vl_storev4sf_mask:
   14254              :         case CODE_FOR_avx512vl_storev4si_mask:
   14255              :         case CODE_FOR_avx512vl_storev2df_mask:
   14256              :         case CODE_FOR_avx512vl_storev2di_mask:
   14257        11813 :           aligned_mem = true;
   14258              :           break;
   14259              :         default:
   14260              :           break;
   14261              :         }
   14262              :       /* FALLTHRU */
   14263              :     case VOID_FTYPE_PV8SF_V8SI_V8SF:
   14264              :     case VOID_FTYPE_PV4DF_V4DI_V4DF:
   14265              :     case VOID_FTYPE_PV4SF_V4SI_V4SF:
   14266              :     case VOID_FTYPE_PV2DF_V2DI_V2DF:
   14267              :     case VOID_FTYPE_PV8SI_V8SI_V8SI:
   14268              :     case VOID_FTYPE_PV4DI_V4DI_V4DI:
   14269              :     case VOID_FTYPE_PV4SI_V4SI_V4SI:
   14270              :     case VOID_FTYPE_PV2DI_V2DI_V2DI:
   14271              :     case VOID_FTYPE_PV8SI_V8DI_UQI:
   14272              :     case VOID_FTYPE_PV8HI_V8DI_UQI:
   14273              :     case VOID_FTYPE_PV16HI_V16SI_UHI:
   14274              :     case VOID_FTYPE_PUDI_V8DI_UQI:
   14275              :     case VOID_FTYPE_PV16QI_V16SI_UHI:
   14276              :     case VOID_FTYPE_PV4SI_V4DI_UQI:
   14277              :     case VOID_FTYPE_PUDI_V2DI_UQI:
   14278              :     case VOID_FTYPE_PUDI_V4DI_UQI:
   14279              :     case VOID_FTYPE_PUSI_V2DI_UQI:
   14280              :     case VOID_FTYPE_PV8HI_V8SI_UQI:
   14281              :     case VOID_FTYPE_PUDI_V4SI_UQI:
   14282              :     case VOID_FTYPE_PUSI_V4DI_UQI:
   14283              :     case VOID_FTYPE_PUHI_V2DI_UQI:
   14284              :     case VOID_FTYPE_PUDI_V8SI_UQI:
   14285              :     case VOID_FTYPE_PUSI_V4SI_UQI:
   14286              :     case VOID_FTYPE_PCHAR_V64QI_UDI:
   14287              :     case VOID_FTYPE_PCHAR_V32QI_USI:
   14288              :     case VOID_FTYPE_PCHAR_V16QI_UHI:
   14289              :     case VOID_FTYPE_PSHORT_V32HI_USI:
   14290              :     case VOID_FTYPE_PSHORT_V16HI_UHI:
   14291              :     case VOID_FTYPE_PSHORT_V8HI_UQI:
   14292              :     case VOID_FTYPE_PINT_V16SI_UHI:
   14293              :     case VOID_FTYPE_PINT_V8SI_UQI:
   14294              :     case VOID_FTYPE_PINT_V4SI_UQI:
   14295              :     case VOID_FTYPE_PINT64_V8DI_UQI:
   14296              :     case VOID_FTYPE_PINT64_V4DI_UQI:
   14297              :     case VOID_FTYPE_PINT64_V2DI_UQI:
   14298              :     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
   14299              :     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
   14300              :     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
   14301              :     case VOID_FTYPE_PFLOAT_V16SF_UHI:
   14302              :     case VOID_FTYPE_PFLOAT_V8SF_UQI:
   14303              :     case VOID_FTYPE_PFLOAT_V4SF_UQI:
   14304              :     case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
   14305              :     case VOID_FTYPE_PV32QI_V32HI_USI:
   14306              :     case VOID_FTYPE_PV16QI_V16HI_UHI:
   14307              :     case VOID_FTYPE_PUDI_V8HI_UQI:
   14308              :       nargs = 2;
   14309              :       klass = store;
   14310              :       /* Reserve memory operand for target.  */
   14311              :       memory = ARRAY_SIZE (xops);
   14312              :       break;
   14313         1243 :     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
   14314         1243 :     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
   14315         1243 :     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
   14316         1243 :     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
   14317         1243 :     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
   14318         1243 :     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
   14319         1243 :     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
   14320         1243 :     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
   14321         1243 :     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
   14322         1243 :     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
   14323         1243 :     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
   14324         1243 :     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
   14325         1243 :     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
   14326         1243 :     case V32HI_FTYPE_PCV32HI_V32HI_USI:
   14327         1243 :     case V32QI_FTYPE_PCV32QI_V32QI_USI:
   14328         1243 :     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
   14329         1243 :     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
   14330         1243 :     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
   14331         1243 :       switch (icode)
   14332              :         {
   14333              :         /* These builtins and instructions require the memory
   14334              :            to be properly aligned.  */
   14335              :         case CODE_FOR_avx512f_loadv16sf_mask:
   14336              :         case CODE_FOR_avx512f_loadv16si_mask:
   14337              :         case CODE_FOR_avx512f_loadv8df_mask:
   14338              :         case CODE_FOR_avx512f_loadv8di_mask:
   14339              :         case CODE_FOR_avx512vl_loadv8sf_mask:
   14340              :         case CODE_FOR_avx512vl_loadv8si_mask:
   14341              :         case CODE_FOR_avx512vl_loadv4df_mask:
   14342              :         case CODE_FOR_avx512vl_loadv4di_mask:
   14343              :         case CODE_FOR_avx512vl_loadv4sf_mask:
   14344              :         case CODE_FOR_avx512vl_loadv4si_mask:
   14345              :         case CODE_FOR_avx512vl_loadv2df_mask:
   14346              :         case CODE_FOR_avx512vl_loadv2di_mask:
   14347              :         case CODE_FOR_avx512bw_loadv64qi_mask:
   14348              :         case CODE_FOR_avx512vl_loadv32qi_mask:
   14349              :         case CODE_FOR_avx512vl_loadv16qi_mask:
   14350              :         case CODE_FOR_avx512bw_loadv32hi_mask:
   14351              :         case CODE_FOR_avx512vl_loadv16hi_mask:
   14352              :         case CODE_FOR_avx512vl_loadv8hi_mask:
   14353        11813 :           aligned_mem = true;
   14354              :           break;
   14355              :         default:
   14356              :           break;
   14357              :         }
   14358              :       /* FALLTHRU */
   14359              :     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
   14360              :     case V32QI_FTYPE_PCCHAR_V32QI_USI:
   14361              :     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
   14362              :     case V32HI_FTYPE_PCSHORT_V32HI_USI:
   14363              :     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
   14364              :     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
   14365              :     case V16SI_FTYPE_PCINT_V16SI_UHI:
   14366              :     case V8SI_FTYPE_PCINT_V8SI_UQI:
   14367              :     case V4SI_FTYPE_PCINT_V4SI_UQI:
   14368              :     case V8DI_FTYPE_PCINT64_V8DI_UQI:
   14369              :     case V4DI_FTYPE_PCINT64_V4DI_UQI:
   14370              :     case V2DI_FTYPE_PCINT64_V2DI_UQI:
   14371              :     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
   14372              :     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
   14373              :     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
   14374              :     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
   14375              :     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
   14376              :     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
   14377              :     case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
   14378              :       nargs = 3;
   14379              :       klass = load;
   14380              :       memory = 0;
   14381              :       break;
   14382          105 :     case INT_FTYPE_PINT_INT_INT_INT:
   14383          105 :     case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
   14384          105 :       nargs = 4;
   14385          105 :       klass = load;
   14386          105 :       memory = 0;
   14387          105 :       constant = 3;
   14388          105 :       break;
   14389            0 :     default:
   14390            0 :       gcc_unreachable ();
   14391              :     }
   14392              : 
   14393         8339 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   14394              : 
   14395        11813 :   if (klass == store)
   14396              :     {
   14397         1878 :       arg = CALL_EXPR_ARG (exp, 0);
   14398         1878 :       op = expand_normal (arg);
   14399         1878 :       gcc_assert (target == 0);
   14400         1878 :       if (memory)
   14401              :         {
   14402         1715 :           op = ix86_zero_extend_to_Pmode (op);
   14403         1715 :           target = gen_rtx_MEM (tmode, op);
   14404              :           /* target at this point has just BITS_PER_UNIT MEM_ALIGN
   14405              :              on it.  Try to improve it using get_pointer_alignment,
   14406              :              and if the special builtin is one that requires strict
   14407              :              mode alignment, also from its GET_MODE_ALIGNMENT.
   14408              :              Failure to do so could lead to ix86_legitimate_combined_insn
   14409              :              rejecting all changes to such insns.  */
   14410         1715 :           unsigned int align = get_pointer_alignment (arg);
   14411         1715 :           if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
   14412          275 :             align = GET_MODE_ALIGNMENT (tmode);
   14413         3430 :           if (MEM_ALIGN (target) < align)
   14414          422 :             set_mem_align (target, align);
   14415              :         }
   14416              :       else
   14417          163 :         target = force_reg (tmode, op);
   14418              :       arg_adjust = 1;  /* Argument 0 was consumed as the destination above.  */
   14419              :     }
   14420              :   else
   14421              :     {
   14422         9935 :       arg_adjust = 0;
   14423         9935 :       if (optimize
   14424         2918 :           || target == 0
   14425         2918 :           || !register_operand (target, tmode)
   14426        12842 :           || GET_MODE (target) != tmode)
   14427         7028 :         target = gen_reg_rtx (tmode);
   14428              :     }
   14429              : 
   14430        21202 :   for (i = 0; i < nargs; i++)
   14431              :     {
   14432         9389 :       machine_mode mode = insn_p->operand[i + 1].mode;
   14433              : 
   14434         9389 :       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
   14435         9389 :       op = ix86_expand_unsigned_small_int_cst_argument (arg);
   14436              : 
   14437         9389 :       if (i == memory)
   14438              :         {
   14439              :           /* This must be the memory operand.  */
   14440         2354 :           op = ix86_zero_extend_to_Pmode (op);
   14441         2354 :           op = gen_rtx_MEM (mode, op);
   14442              :           /* op at this point has just BITS_PER_UNIT MEM_ALIGN
   14443              :              on it.  Try to improve it using get_pointer_alignment,
   14444              :              and if the special builtin is one that requires strict
   14445              :              mode alignment, also from its GET_MODE_ALIGNMENT.
   14446              :              Failure to do so could lead to ix86_legitimate_combined_insn
   14447              :              rejecting all changes to such insns.  */
   14448         2354 :           unsigned int align = get_pointer_alignment (arg);
   14449         2354 :           if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
   14450          299 :             align = GET_MODE_ALIGNMENT (mode);
   14451         4708 :           if (MEM_ALIGN (op) < align)
   14452          523 :             set_mem_align (op, align);
   14453              :         }
   14454         7035 :       else if (i == constant)
   14455              :         {
   14456              :           /* This must be the constant.  */
   14457          105 :           if (!insn_p->operand[nargs].predicate(op, SImode))
   14458              :             {
   14459            0 :               error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
   14460            0 :               return const0_rtx;
   14461              :             }
   14462              :         }
   14463              :       else
   14464              :         {
   14465              :           /* This must be register.  */
   14466         6930 :           if (VECTOR_MODE_P (mode))
   14467         3475 :             op = safe_vector_operand (op, mode);
   14468              : 
   14469         6930 :           op = fixup_modeless_constant (op, mode);
   14470              : 
   14471              :           /* NB: 3-operands load implied it's a mask load or v{p}expand*,
   14472              :              and that mask operand should be at the end.
   14473              :              Keep all-ones mask which would be simplified by the expander.
                      :              NOTE(review): MODE above is taken from operand[i + 1], but
                      :              the predicate consulted below is operand[i]'s; confirm that
                      :              this index is intended.  */
   14474         1771 :           if (nargs == 3 && i == 2 && klass == load
   14475         1771 :               && constm1_operand (op, mode)
   14476         7103 :               && insn_p->operand[i].predicate (op, mode))
   14477              :             ;
   14478         6930 :           else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   14479         6930 :             op = copy_to_mode_reg (mode, op);
   14480              :           else
   14481              :             {
   14482            0 :               op = copy_to_reg (op);
   14483            0 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   14484              :             }
   14485              :         }
   14486              : 
   14487         9389 :       xops[i]= op;
   14488              :     }
   14489              : 
   14490        11813 :   switch (nargs)
   14491              :     {
   14492         7744 :     case 0:
   14493         7744 :       pat = GEN_FCN (icode) (target);
   14494         7744 :       break;
   14495          730 :     case 1:
   14496          730 :       pat = GEN_FCN (icode) (target, xops[0]);
   14497          730 :       break;
   14498         1463 :     case 2:
   14499         1463 :       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
   14500         1463 :       break;
   14501         1771 :     case 3:
   14502         1771 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
   14503         1771 :       break;
   14504          105 :     case 4:
   14505          105 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
   14506          105 :       break;
   14507              :     default:
   14508              :       gcc_unreachable ();
   14509              :     }
   14510              : 
   14511        11813 :   if (! pat)
   14512              :     return 0;
   14513              : 
   14514        11813 :   emit_insn (pat);
   14515        11813 :   return klass == store ? 0 : target;
   14516              : }
   14517              : 
   14518              : /* Return the integer constant in ARG.  Constrain it to be in the range
   14519              :    of the subparts of VEC_TYPE; issue an error if not.
                      :
                      :    The valid range is [0, TYPE_VECTOR_SUBPARTS (VEC_TYPE) - 1].  When
                      :    ARG does not fit an unsigned HOST_WIDE_INT or lies above that
                      :    range, an error is reported and 0 is returned, so the caller
                      :    always receives an in-range index.  The upper bound is unsigned,
                      :    hence it is printed with %wu rather than the signed %wi.  */
   14520              : 
   14521              : static int
   14522          603 : get_element_number (tree vec_type, tree arg)
   14523              : {
   14524          603 :   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
   14525              : 
   14526          603 :   if (!tree_fits_uhwi_p (arg)
   14527          603 :       || (elt = tree_to_uhwi (arg), elt > max))
   14528              :     {
   14529            0 :       error ("selector must be an integer constant in the range "
   14530              :              "[0, %wu]", max);
   14531            0 :       return 0;
   14532              :     }
   14533              : 
   14534          603 :   return elt;
   14535              : }
   14536              : 
   14537              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14538              :    ix86_expand_vector_init.  We DO have language-level syntax for this, in
   14539              :    the form of  (type){ init-list }.  Except that since we can't place emms
   14540              :    instructions from inside the compiler, we can't allow the use of MMX
   14541              :    registers unless the user explicitly asks for it.  So we do *not* define
   14542              :    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   14543              :    we have builtins invoked by mmintrin.h that gives us license to emit
   14544              :    these sorts of instructions.
                      :
                      :    TYPE is the vector type to build; its mode must be a vector mode.
                      :    EXP is the CALL_EXPR and must carry exactly one argument per vector
                      :    element (both conditions are asserted).  TARGET is a suggested
                      :    destination; it is replaced by a register from gen_reg_rtx when it
                      :    is null or does not satisfy register_operand for the vector mode.
                      :    Returns the rtx holding the initialized vector.  */
   14545              : 
   14546              : static rtx
   14547          229 : ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
   14548              : {
   14549          229 :   machine_mode tmode = TYPE_MODE (type);
   14550          229 :   machine_mode inner_mode = GET_MODE_INNER (tmode);
   14551          229 :   int i, n_elt = GET_MODE_NUNITS (tmode);
   14552          229 :   rtvec v = rtvec_alloc (n_elt);
   14553              : 
   14554          229 :   gcc_assert (VECTOR_MODE_P (tmode));
   14555          229 :   gcc_assert (call_expr_nargs (exp) == n_elt);
   14556              : 
   14557         1203 :   for (i = 0; i < n_elt; ++i)
   14558              :     {
   14559          974 :       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
   14560          974 :       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);  /* Element I, viewed in the element mode.  */
   14561              :     }
   14562              : 
   14563          229 :   if (!target || !register_operand (target, tmode))
   14564            0 :     target = gen_reg_rtx (tmode);
   14565              : 
   14566          229 :   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
   14567          229 :   return target;
   14568              : }
   14569              : 
   14570              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14571              :    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   14572              :    had a language-level syntax for referencing vector elements.
                      :
                      :    Argument 0 of EXP is the vector and argument 1 is the element
                      :    selector, which is validated by get_element_number against the
                      :    vector's type.  TARGET is a suggested destination; it is only
                      :    reused when not optimizing and when it satisfies register_operand
                      :    for the element mode, otherwise a register from gen_reg_rtx is
                      :    used.  Returns the rtx holding the extracted element.  */
   14573              : 
   14574              : static rtx
   14575          399 : ix86_expand_vec_ext_builtin (tree exp, rtx target)
   14576              : {
   14577          399 :   machine_mode tmode, mode0;
   14578          399 :   tree arg0, arg1;
   14579          399 :   int elt;
   14580          399 :   rtx op0;
   14581              : 
   14582          399 :   arg0 = CALL_EXPR_ARG (exp, 0);
   14583          399 :   arg1 = CALL_EXPR_ARG (exp, 1);
   14584              : 
   14585          399 :   op0 = expand_normal (arg0);
   14586          399 :   elt = get_element_number (TREE_TYPE (arg0), arg1);
   14587              : 
   14588          399 :   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));  /* Mode of one element.  */
   14589          399 :   mode0 = TYPE_MODE (TREE_TYPE (arg0));  /* Mode of the whole vector.  */
   14590          399 :   gcc_assert (VECTOR_MODE_P (mode0));
   14591              : 
   14592          399 :   op0 = force_reg (mode0, op0);
   14593              : 
   14594          399 :   if (optimize || !target || !register_operand (target, tmode))
   14595          320 :     target = gen_reg_rtx (tmode);
   14596              : 
   14597          399 :   ix86_expand_vector_extract (true, target, op0, elt);
   14598              : 
   14599          399 :   return target;
   14600              : }
   14601              : 
   14602              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14603              :    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   14604              :    a language-level syntax for referencing vector elements.
                      :
                      :    The arguments of EXP are, in order, the source vector, the new
                      :    element value, and the element selector (validated by
                      :    get_element_number).  The element value is converted with
                      :    convert_modes when its mode differs from the vector's element
                      :    mode.  Returns a register from gen_reg_rtx that holds a copy of
                      :    the source vector with the selected element replaced.  */
   14605              : 
   14606              : static rtx
   14607          204 : ix86_expand_vec_set_builtin (tree exp)
   14608              : {
   14609          204 :   machine_mode tmode, mode1;
   14610          204 :   tree arg0, arg1, arg2;
   14611          204 :   int elt;
   14612          204 :   rtx op0, op1, target;
   14613              : 
   14614          204 :   arg0 = CALL_EXPR_ARG (exp, 0);
   14615          204 :   arg1 = CALL_EXPR_ARG (exp, 1);
   14616          204 :   arg2 = CALL_EXPR_ARG (exp, 2);
   14617              : 
   14618          204 :   tmode = TYPE_MODE (TREE_TYPE (arg0));  /* Mode of the whole vector.  */
   14619          204 :   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));  /* Mode of one element.  */
   14620          204 :   gcc_assert (VECTOR_MODE_P (tmode));
   14621              : 
   14622          204 :   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
   14623          204 :   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
   14624          204 :   elt = get_element_number (TREE_TYPE (arg0), arg2);
   14625              : 
   14626          204 :   if (GET_MODE (op1) != mode1)
   14627           82 :     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
   14628              : 
   14629          204 :   op0 = force_reg (tmode, op0);
   14630          204 :   op1 = force_reg (mode1, op1);
   14631              : 
   14632              :   /* OP0 is the source of these builtin functions and shouldn't be
   14633              :      modified.  Create a copy, use it and return it as target.  */
   14634          204 :   target = gen_reg_rtx (tmode);
   14635          204 :   emit_move_insn (target, op0);
   14636          204 :   ix86_expand_vector_set (true, target, op1, elt);
   14637              : 
   14638          204 :   return target;
   14639              : }
   14640              : 
   14641              : /* Return true if the necessary isa options for this builtin exist,
   14642              :    else false.
   14643              :    fcode = DECL_MD_FUNCTION_CODE (fndecl);
                      :    FCODE is used as the index into ix86_builtins_isa; the enabled
                      :    options are read from ix86_isa_flags and ix86_isa_flags2.
                      :    If PBISA is non-null, *PBISA is set to the isa mask the builtin
                      :    requires, after the MMX -> SSE2 substitution done below.  If
                      :    PBISA2 is non-null, *PBISA2 is set to the builtin's isa2 mask,
                      :    which this function never modifies.  Both are stored whether or
                      :    not the check succeeds.
                      :    The result is true when every bit of those two masks is set in
                      :    the enabled options as augmented by the exceptions listed in the
                      :    function body.  */
   14644              : bool
   14645      1294782 : ix86_check_builtin_isa_match (unsigned int fcode,
   14646              :                               HOST_WIDE_INT* pbisa,
   14647              :                               HOST_WIDE_INT* pbisa2)
   14648              : {
   14649      1294782 :   HOST_WIDE_INT isa = ix86_isa_flags;
   14650      1294782 :   HOST_WIDE_INT isa2 = ix86_isa_flags2;
   14651      1294782 :   HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
   14652      1294782 :   HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
   14653      1294782 :   HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
   14654              :   /* The general case is we require all the ISAs specified in bisa{,2}
   14655              :      to be enabled.
   14656              :      The exceptions are:
   14657              :      OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
   14658              :      OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
   14659              :      OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
   14660              :      (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
   14661              :        OPTION_MASK_ISA2_AVXVNNI
   14662              :      (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
   14663              :        OPTION_MASK_ISA2_AVXIFMA
   14664              :      (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
   14665              :        OPTION_MASK_ISA2_AVXNECONVERT
   14666              :      OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
   14667              :      OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT8
   14668              :      OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT16
   14669              :      where for each such pair it is sufficient if either of the ISAs is
   14670              :      enabled, plus if it is ored with other options also those others.
   14671              :      OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.
                      :
                      :      SHARE_BUILTIN (A1, A2, B1, B2) handles one such pair.  A1 and B1
                      :      are masks tested against the isa words, A2 and B2 against the
                      :      isa2 words.  If the builtin's bisa{,2} contains all of A and all
                      :      of B, and the enabled isa{,2} contains all of A or all of B, then
                      :      both A and B are ored into tmp_isa{,2}.
                      :      Every invocation tests the original isa{,2}; the results only
                      :      accumulate in tmp_isa{,2}, which is copied back to isa{,2} after
                      :      the last invocation.
                      :      NOTE(review): the macro is not #undef'd before the end of this
                      :      function.  */
   14672              : 
   14673              : #define SHARE_BUILTIN(A1, A2, B1, B2) \
   14674              :   if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
   14675              :        && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
   14676              :       && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
   14677              :           || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
   14678              :     { \
   14679              :       tmp_isa |= (A1) | (B1); \
   14680              :       tmp_isa2 |= (A2) | (B2); \
   14681              :     }
   14682              : 
   14683      1294782 :   SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
   14684      1294782 :   SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
   14685      1294782 :   SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
   14686      1294782 :   SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
   14687      1294782 :                  OPTION_MASK_ISA2_AVXVNNI);
   14688      1294782 :   SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
   14689      1294782 :                  OPTION_MASK_ISA2_AVXIFMA);
   14690      1294782 :   SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
   14691      1294782 :                  OPTION_MASK_ISA2_AVXNECONVERT);
   14692      1294782 :   SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
   14693      1294782 :                  OPTION_MASK_ISA2_VAES);
   14694      1294782 :   SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT8, 0,
   14695      1294782 :                  OPTION_MASK_ISA2_AVX10_2);
   14696      1294782 :   SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT16, 0,
   14697      1294782 :                  OPTION_MASK_ISA2_AVX10_2);
   14698      1294782 :   isa = tmp_isa;
   14699      1294782 :   isa2 = tmp_isa2;
   14700              : 
   14701      1294782 :   if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
   14702              :       /* __builtin_ia32_maskmovq requires MMX registers, so it keeps its
                      :          MMX requirement.  For any other builtin reaching the body
                      :          below, MMX is removed from bisa and SSE2 is required
                      :          instead.  */
   14703         4563 :       && fcode != IX86_BUILTIN_MASKMOVQ)
   14704              :     {
   14705         4554 :       bisa &= ~OPTION_MASK_ISA_MMX;
   14706         4554 :       bisa |= OPTION_MASK_ISA_SSE2;
   14707              :     }
   14708              : 
   14709      1294782 :   if (pbisa)
   14710       173271 :     *pbisa = bisa;
   14711      1294782 :   if (pbisa2)
   14712       173271 :     *pbisa2 = bisa2;
   14713              : 
   14714      1294782 :   return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
   14715              : }
   14716              : 
   14717              : /* Emit instructions to set the carry flag from ARG.
                      :    When ARG is not a CONST_INT, or is const0_rtx, it is converted to
                      :    QImode (convert_to_mode with unsignedp 1), copied into a QImode
                      :    register, and gen_addqi3_cconly_overflow is emitted with that
                      :    register and constm1_rtx as operands.  For any other CONST_INT a
                      :    single gen_x86_stc insn is emitted and ARG's value is not used.
                      :    NOTE(review): presumably the add of -1 yields carry exactly when
                      :    the QImode value is nonzero -- confirm against the
                      :    addqi3_cconly_overflow pattern.  */
   14718              : 
   14719              : void
   14720        13560 : ix86_expand_carry (rtx arg)
   14721              : {
   14722        13560 :   if (!CONST_INT_P (arg) || arg == const0_rtx)
   14723              :     {
   14724        13554 :       arg = convert_to_mode (QImode, arg, 1);
   14725        13554 :       arg = copy_to_mode_reg (QImode, arg);
   14726        13554 :       emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
   14727              :     }
   14728              :   else
   14729            6 :     emit_insn (gen_x86_stc ());
   14730        13560 : }
   14731              : 
   14732              : /* Expand an expression EXP that calls a built-in function,
   14733              :    with result going to TARGET if that's convenient
   14734              :    (and in mode MODE if that's convenient).
   14735              :    SUBTARGET may be used as the target for computing one of EXP's operands.
   14736              :    IGNORE is nonzero if the value is to be ignored.  */
   14737              : 
   14738              : rtx
   14739       174062 : ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
   14740              :                      machine_mode mode, int ignore)
   14741              : {
   14742       174062 :   size_t i;
   14743       174062 :   enum insn_code icode, icode2;
   14744       174062 :   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   14745       174062 :   tree arg0, arg1, arg2, arg3, arg4;
   14746       174062 :   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
   14747       174062 :   machine_mode mode0, mode1, mode2, mode3, mode4;
   14748       174062 :   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
   14749       174062 :   HOST_WIDE_INT bisa, bisa2;
   14750              : 
   14751              :   /* For CPU builtins that can be folded, fold first and expand the fold.  */
   14752       174062 :   switch (fcode)
   14753              :     {
   14754          196 :     case IX86_BUILTIN_CPU_INIT:
   14755          196 :       {
   14756              :         /* Make it call __cpu_indicator_init in libgcc.  */
   14757          196 :         tree call_expr, fndecl, type;
   14758          196 :         type = build_function_type_list (integer_type_node, NULL_TREE);
   14759          196 :         fndecl = build_fn_decl ("__cpu_indicator_init", type);
   14760          196 :         call_expr = build_call_expr (fndecl, 0);
   14761          196 :         return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
   14762              :       }
   14763          595 :     case IX86_BUILTIN_CPU_IS:
   14764          595 :     case IX86_BUILTIN_CPU_SUPPORTS:
   14765          595 :       {
   14766          595 :         tree arg0 = CALL_EXPR_ARG (exp, 0);
   14767          595 :         tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
   14768          595 :         gcc_assert (fold_expr != NULL_TREE);
   14769          595 :         return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
   14770              :       }
   14771              :     }
   14772              : 
   14773       173271 :   if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
   14774              :     {
   14775           23 :       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
   14776           23 :       if (TARGET_ABI_X32)
   14777            0 :         bisa |= OPTION_MASK_ABI_X32;
   14778              :       else
   14779           23 :         bisa |= OPTION_MASK_ABI_64;
   14780           23 :       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
   14781              :                                        (enum fpmath_unit) 0,
   14782              :                                        (enum prefer_vector_width) 0,
   14783              :                                        PVW_NONE, false, add_abi_p);
   14784           23 :       if (!opts)
   14785            0 :         error ("%qE needs unknown isa option", fndecl);
   14786              :       else
   14787              :         {
   14788           23 :           gcc_assert (opts != NULL);
   14789           23 :           error ("%qE needs isa option %s", fndecl, opts);
   14790           23 :           free (opts);
   14791              :         }
   14792           23 :       return expand_call (exp, target, ignore);
   14793              :     }
   14794              : 
   14795       173248 :   switch (fcode)
   14796              :     {
   14797           35 :     case IX86_BUILTIN_MASKMOVQ:
   14798           35 :     case IX86_BUILTIN_MASKMOVDQU:
   14799           34 :       icode = (fcode == IX86_BUILTIN_MASKMOVQ
   14800           35 :                ? CODE_FOR_mmx_maskmovq
   14801              :                : CODE_FOR_sse2_maskmovdqu);
   14802              :       /* Note the arg order is different from the operand order.  */
   14803           35 :       arg1 = CALL_EXPR_ARG (exp, 0);
   14804           35 :       arg2 = CALL_EXPR_ARG (exp, 1);
   14805           35 :       arg0 = CALL_EXPR_ARG (exp, 2);
   14806           35 :       op0 = expand_normal (arg0);
   14807           35 :       op1 = expand_normal (arg1);
   14808           35 :       op2 = expand_normal (arg2);
   14809           35 :       mode0 = insn_data[icode].operand[0].mode;
   14810           35 :       mode1 = insn_data[icode].operand[1].mode;
   14811           35 :       mode2 = insn_data[icode].operand[2].mode;
   14812              : 
   14813           35 :       op0 = ix86_zero_extend_to_Pmode (op0);
   14814           35 :       op0 = gen_rtx_MEM (mode1, op0);
   14815              : 
   14816           35 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   14817            0 :         op0 = copy_to_mode_reg (mode0, op0);
   14818           35 :       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   14819            2 :         op1 = copy_to_mode_reg (mode1, op1);
   14820           35 :       if (!insn_data[icode].operand[2].predicate (op2, mode2))
   14821            2 :         op2 = copy_to_mode_reg (mode2, op2);
   14822           35 :       pat = GEN_FCN (icode) (op0, op1, op2);
   14823           35 :       if (! pat)
   14824        56621 :         return 0;
   14825           35 :       emit_insn (pat);
   14826           35 :       return 0;
   14827              : 
   14828        22008 :     case IX86_BUILTIN_LDMXCSR:
   14829        22008 :       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
   14830        22008 :       target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
   14831        22008 :       emit_move_insn (target, op0);
   14832        22008 :       emit_insn (gen_sse_ldmxcsr (target));
   14833        22008 :       return 0;
   14834              : 
   14835        14785 :     case IX86_BUILTIN_STMXCSR:
   14836        14785 :       target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
   14837        14785 :       emit_insn (gen_sse_stmxcsr (target));
   14838        14785 :       return copy_to_mode_reg (SImode, target);
   14839              : 
   14840           11 :     case IX86_BUILTIN_CLFLUSH:
   14841           11 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14842           11 :         op0 = expand_normal (arg0);
   14843           11 :         icode = CODE_FOR_sse2_clflush;
   14844           11 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14845            5 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14846              : 
   14847           11 :         emit_insn (gen_sse2_clflush (op0));
   14848           11 :         return 0;
   14849              : 
   14850           19 :     case IX86_BUILTIN_CLWB:
   14851           19 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14852           19 :         op0 = expand_normal (arg0);
   14853           19 :         icode = CODE_FOR_clwb;
   14854           19 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14855            9 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14856              : 
   14857           19 :         emit_insn (gen_clwb (op0));
   14858           19 :         return 0;
   14859              : 
   14860           19 :     case IX86_BUILTIN_CLFLUSHOPT:
   14861           19 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14862           19 :         op0 = expand_normal (arg0);
   14863           19 :         icode = CODE_FOR_clflushopt;
   14864           19 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14865            9 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14866              : 
   14867           19 :         emit_insn (gen_clflushopt (op0));
   14868           19 :         return 0;
   14869              : 
   14870           47 :     case IX86_BUILTIN_MONITOR:
   14871           47 :     case IX86_BUILTIN_MONITORX:
   14872           47 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14873           47 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14874           47 :       arg2 = CALL_EXPR_ARG (exp, 2);
   14875           47 :       op0 = expand_normal (arg0);
   14876           47 :       op1 = expand_normal (arg1);
   14877           47 :       op2 = expand_normal (arg2);
   14878           47 :       if (!REG_P (op0))
   14879           19 :         op0 = ix86_zero_extend_to_Pmode (op0);
   14880           47 :       if (!REG_P (op1))
   14881           22 :         op1 = copy_to_mode_reg (SImode, op1);
   14882           47 :       if (!REG_P (op2))
   14883           25 :         op2 = copy_to_mode_reg (SImode, op2);
   14884              : 
   14885           47 :       emit_insn (fcode == IX86_BUILTIN_MONITOR
   14886           26 :                  ? gen_sse3_monitor (Pmode, op0, op1, op2)
   14887           21 :                  : gen_monitorx (Pmode, op0, op1, op2));
   14888           47 :       return 0;
   14889              : 
   14890           25 :     case IX86_BUILTIN_MWAIT:
   14891           25 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14892           25 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14893           25 :       op0 = expand_normal (arg0);
   14894           25 :       op1 = expand_normal (arg1);
   14895           25 :       if (!REG_P (op0))
   14896           13 :         op0 = copy_to_mode_reg (SImode, op0);
   14897           25 :       if (!REG_P (op1))
   14898           11 :         op1 = copy_to_mode_reg (SImode, op1);
   14899           25 :       emit_insn (gen_sse3_mwait (op0, op1));
   14900           25 :       return 0;
   14901              : 
   14902           21 :     case IX86_BUILTIN_MWAITX:
   14903           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14904           21 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14905           21 :       arg2 = CALL_EXPR_ARG (exp, 2);
   14906           21 :       op0 = expand_normal (arg0);
   14907           21 :       op1 = expand_normal (arg1);
   14908           21 :       op2 = expand_normal (arg2);
   14909           21 :       if (!REG_P (op0))
   14910           11 :         op0 = copy_to_mode_reg (SImode, op0);
   14911           21 :       if (!REG_P (op1))
   14912           10 :         op1 = copy_to_mode_reg (SImode, op1);
   14913           21 :       if (!REG_P (op2))
   14914           11 :         op2 = copy_to_mode_reg (SImode, op2);
   14915           21 :       emit_insn (gen_mwaitx (op0, op1, op2));
   14916           21 :       return 0;
   14917              : 
   14918           21 :     case IX86_BUILTIN_UMONITOR:
   14919           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14920           21 :       op0 = expand_normal (arg0);
   14921              : 
   14922           21 :       op0 = ix86_zero_extend_to_Pmode (op0);
   14923           21 :       emit_insn (gen_umonitor (Pmode, op0));
   14924           21 :       return 0;
   14925              : 
   14926           42 :     case IX86_BUILTIN_UMWAIT:
   14927           42 :     case IX86_BUILTIN_TPAUSE:
   14928           42 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14929           42 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14930           42 :       op0 = expand_normal (arg0);
   14931           42 :       op1 = expand_normal (arg1);
   14932              : 
   14933           42 :       if (!REG_P (op0))
   14934           20 :         op0 = copy_to_mode_reg (SImode, op0);
   14935              : 
   14936           42 :       op1 = force_reg (DImode, op1);
   14937              : 
   14938           42 :       if (TARGET_64BIT)
   14939              :         {
   14940           42 :           op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   14941              :                                      NULL, 1, OPTAB_DIRECT);
   14942           42 :           switch (fcode)
   14943              :             {
   14944              :             case IX86_BUILTIN_UMWAIT:
   14945              :               icode = CODE_FOR_umwait_rex64;
   14946              :               break;
   14947           21 :             case IX86_BUILTIN_TPAUSE:
   14948           21 :               icode = CODE_FOR_tpause_rex64;
   14949           21 :               break;
   14950            0 :             default:
   14951            0 :               gcc_unreachable ();
   14952              :             }
   14953              : 
   14954           42 :           op2 = gen_lowpart (SImode, op2);
   14955           42 :           op1 = gen_lowpart (SImode, op1);
   14956           42 :           pat = GEN_FCN (icode) (op0, op1, op2);
   14957              :         }
   14958              :       else
   14959              :         {
   14960            0 :           switch (fcode)
   14961              :             {
   14962              :             case IX86_BUILTIN_UMWAIT:
   14963              :               icode = CODE_FOR_umwait;
   14964              :               break;
   14965            0 :             case IX86_BUILTIN_TPAUSE:
   14966            0 :               icode = CODE_FOR_tpause;
   14967            0 :               break;
   14968            0 :             default:
   14969            0 :               gcc_unreachable ();
   14970              :             }
   14971            0 :           pat = GEN_FCN (icode) (op0, op1);
   14972              :         }
   14973              : 
   14974           42 :       if (!pat)
   14975              :         return 0;
   14976              : 
   14977           42 :       emit_insn (pat);
   14978              : 
   14979           42 :       if (target == 0
   14980           42 :           || !register_operand (target, QImode))
   14981            0 :         target = gen_reg_rtx (QImode);
   14982              : 
   14983           42 :       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   14984              :                         const0_rtx);
   14985           42 :       emit_insn (gen_rtx_SET (target, pat));
   14986              : 
   14987           42 :       return target;
   14988              : 
   14989           20 :     case IX86_BUILTIN_TESTUI:
   14990           20 :       emit_insn (gen_testui ());
   14991              : 
   14992           20 :       if (target == 0
   14993           20 :           || !register_operand (target, QImode))
   14994            0 :         target = gen_reg_rtx (QImode);
   14995              : 
   14996           20 :       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   14997              :                          const0_rtx);
   14998           20 :       emit_insn (gen_rtx_SET (target, pat));
   14999              : 
   15000           20 :       return target;
   15001              : 
   15002           19 :     case IX86_BUILTIN_CLZERO:
   15003           19 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15004           19 :       op0 = expand_normal (arg0);
   15005           19 :       if (!REG_P (op0))
   15006            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   15007           19 :       emit_insn (gen_clzero (Pmode, op0));
   15008           19 :       return 0;
   15009              : 
   15010           19 :     case IX86_BUILTIN_CLDEMOTE:
   15011           19 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15012           19 :       op0 = expand_normal (arg0);
   15013           19 :       icode = CODE_FOR_cldemote;
   15014           19 :       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   15015            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   15016              : 
   15017           19 :       emit_insn (gen_cldemote (op0));
   15018           19 :       return 0;
   15019              : 
   15020           11 :     case IX86_BUILTIN_LOADIWKEY:
   15021           11 :       {
   15022           11 :         arg0 = CALL_EXPR_ARG (exp, 0);
   15023           11 :         arg1 = CALL_EXPR_ARG (exp, 1);
   15024           11 :         arg2 = CALL_EXPR_ARG (exp, 2);
   15025           11 :         arg3 = CALL_EXPR_ARG (exp, 3);
   15026              : 
   15027           11 :         op0 = expand_normal (arg0);
   15028           11 :         op1 = expand_normal (arg1);
   15029           11 :         op2 = expand_normal (arg2);
   15030           11 :         op3 = expand_normal (arg3);
   15031              : 
   15032           11 :         if (!REG_P (op0))
   15033            5 :           op0 = copy_to_mode_reg (V2DImode, op0);
   15034           11 :         if (!REG_P (op1))
   15035            5 :           op1 = copy_to_mode_reg (V2DImode, op1);
   15036           11 :         if (!REG_P (op2))
   15037            5 :           op2 = copy_to_mode_reg (V2DImode, op2);
   15038           11 :         if (!REG_P (op3))
   15039            5 :           op3 = copy_to_mode_reg (SImode, op3);
   15040              : 
   15041           11 :         emit_insn (gen_loadiwkey (op0, op1, op2, op3));
   15042              : 
   15043           11 :         return 0;
   15044              :       }
   15045              : 
   15046           12 :     case IX86_BUILTIN_AESDEC128KLU8:
   15047           12 :       icode = CODE_FOR_aesdec128klu8;
   15048           12 :       goto aesdecenc_expand;
   15049              : 
   15050           12 :     case IX86_BUILTIN_AESDEC256KLU8:
   15051           12 :       icode = CODE_FOR_aesdec256klu8;
   15052           12 :       goto aesdecenc_expand;
   15053              : 
   15054           12 :     case IX86_BUILTIN_AESENC128KLU8:
   15055           12 :       icode = CODE_FOR_aesenc128klu8;
   15056           12 :       goto aesdecenc_expand;
   15057              : 
   15058              :     case IX86_BUILTIN_AESENC256KLU8:
   15059              :       icode = CODE_FOR_aesenc256klu8;
   15060              : 
   15061           48 :     aesdecenc_expand:
   15062              : 
   15063           48 :       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
   15064           48 :       arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
   15065           48 :       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   15066              : 
   15067           48 :       op0 = expand_normal (arg0);
   15068           48 :       op1 = expand_normal (arg1);
   15069           48 :       op2 = expand_normal (arg2);
   15070              : 
   15071           48 :       if (!address_operand (op0, V2DImode))
   15072              :         {
   15073           16 :           op0 = convert_memory_address (Pmode, op0);
   15074           16 :           op0 = copy_addr_to_reg (op0);
   15075              :         }
   15076           48 :       op0 = gen_rtx_MEM (V2DImode, op0);
   15077              : 
   15078           48 :       if (!REG_P (op1))
   15079           20 :         op1 = copy_to_mode_reg (V2DImode, op1);
   15080              : 
   15081           48 :       if (!address_operand (op2, VOIDmode))
   15082              :         {
   15083           16 :           op2 = convert_memory_address (Pmode, op2);
   15084           16 :           op2 = copy_addr_to_reg (op2);
   15085              :         }
   15086           48 :       op2 = gen_rtx_MEM (BLKmode, op2);
   15087              : 
   15088           48 :       emit_insn (GEN_FCN (icode) (op1, op1, op2));
   15089              : 
   15090           48 :       if (target == 0)
   15091            4 :         target = gen_reg_rtx (QImode);
   15092              : 
   15093              :       /* NB: For aesenc/aesdec keylocker insns, ZF is set when a runtime
   15094              :          error occurs.  In that case the output is cleared for safety.  */
   15095           48 :       rtx_code_label *ok_label;
   15096           48 :       rtx tmp;
   15097              : 
   15098           48 :       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   15099           48 :       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   15100           48 :       ok_label = gen_label_rtx ();
   15101           48 :       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   15102              :                                true, ok_label);
   15103              :       /* Runtime errors seldom occur, so predict the OK path as the hot
   15104              :          path and let it be optimized as the fallthrough block.  */
   15105           48 :       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   15106              : 
   15107           48 :       emit_insn (gen_rtx_SET (op1, const0_rtx));
   15108              : 
   15109           48 :       emit_label (ok_label);
   15110           48 :       emit_insn (gen_rtx_SET (target, pat));
   15111           48 :       emit_insn (gen_rtx_SET (op0, op1));
   15112              : 
   15113           48 :       return target;
   15114              : 
   15115           11 :     case IX86_BUILTIN_AESDECWIDE128KLU8:
   15116           11 :       icode = CODE_FOR_aesdecwide128klu8;
   15117           11 :       goto wideaesdecenc_expand;
   15118              : 
   15119           11 :     case IX86_BUILTIN_AESDECWIDE256KLU8:
   15120           11 :       icode = CODE_FOR_aesdecwide256klu8;
   15121           11 :       goto wideaesdecenc_expand;
   15122              : 
   15123           11 :     case IX86_BUILTIN_AESENCWIDE128KLU8:
   15124           11 :       icode = CODE_FOR_aesencwide128klu8;
   15125           11 :       goto wideaesdecenc_expand;
   15126              : 
   15127              :     case IX86_BUILTIN_AESENCWIDE256KLU8:
   15128              :       icode = CODE_FOR_aesencwide256klu8;
   15129              : 
   15130           44 :     wideaesdecenc_expand:
   15131              : 
   15132           44 :       rtx xmm_regs[8];
   15133           44 :       rtx op;
   15134              : 
   15135           44 :       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
   15136           44 :       arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
   15137           44 :       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   15138              : 
   15139           44 :       op0 = expand_normal (arg0);
   15140           44 :       op1 = expand_normal (arg1);
   15141           44 :       op2 = expand_normal (arg2);
   15142              : 
   15143           44 :       if (GET_MODE (op1) != Pmode)
   15144            0 :         op1 = convert_to_mode (Pmode, op1, 1);
   15145              : 
   15146           44 :       if (!address_operand (op2, VOIDmode))
   15147              :         {
   15148           16 :           op2 = convert_memory_address (Pmode, op2);
   15149           16 :           op2 = copy_addr_to_reg (op2);
   15150              :         }
   15151           44 :       op2 = gen_rtx_MEM (BLKmode, op2);
   15152              : 
   15153          440 :       for (i = 0; i < 8; i++)
   15154              :         {
   15155          352 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15156              : 
   15157          352 :           op = gen_rtx_MEM (V2DImode,
   15158          352 :                             plus_constant (Pmode, op1, (i * 16)));
   15159              : 
   15160          352 :           emit_move_insn (xmm_regs[i], op);
   15161              :         }
   15162              : 
   15163           44 :       emit_insn (GEN_FCN (icode) (op2));
   15164              : 
   15165           44 :       if (target == 0)
   15166            0 :         target = gen_reg_rtx (QImode);
   15167              : 
   15168           44 :       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   15169           44 :       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   15170           44 :       ok_label = gen_label_rtx ();
   15171           44 :       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   15172              :                                true, ok_label);
   15173           44 :       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   15174              : 
   15175          440 :       for (i = 0; i < 8; i++)
   15176          352 :         emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
   15177              : 
   15178           44 :       emit_label (ok_label);
   15179           44 :       emit_insn (gen_rtx_SET (target, pat));
   15180              : 
   15181           44 :       if (GET_MODE (op0) != Pmode)
   15182            0 :         op0 = convert_to_mode (Pmode, op0, 1);
   15183              : 
   15184          396 :       for (i = 0; i < 8; i++)
   15185              :         {
   15186          352 :           op = gen_rtx_MEM (V2DImode,
   15187          352 :                             plus_constant (Pmode, op0, (i * 16)));
   15188          352 :           emit_move_insn (op, xmm_regs[i]);
   15189              :         }
   15190              : 
   15191              :       return target;
   15192              : 
   15193           13 :     case IX86_BUILTIN_ENCODEKEY128U32:
   15194           13 :       {
   15195           13 :         rtx op, xmm_regs[7];
   15196              : 
   15197           13 :         arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   15198           13 :         arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
   15199           13 :         arg2 = CALL_EXPR_ARG (exp, 2); // void *h
   15200              : 
   15201           13 :         op0 = expand_normal (arg0);
   15202           13 :         op1 = expand_normal (arg1);
   15203           13 :         op2 = expand_normal (arg2);
   15204              : 
   15205           13 :         if (!REG_P (op0))
   15206            7 :           op0 = copy_to_mode_reg (SImode, op0);
   15207              : 
   15208           13 :         if (GET_MODE (op2) != Pmode)
   15209            1 :           op2 = convert_to_mode (Pmode, op2, 1);
   15210              : 
   15211           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   15212           13 :         emit_move_insn (op, op1);
   15213              : 
   15214           65 :         for (i = 0; i < 3; i++)
   15215           39 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15216              : 
   15217           13 :         if (target == 0 || !register_operand (target, SImode))
   15218            2 :           target = gen_reg_rtx (SImode);
   15219              : 
   15220           13 :         emit_insn (gen_encodekey128u32 (target, op0));
   15221              : 
   15222           65 :         for (i = 0; i < 3; i++)
   15223              :           {
   15224           39 :             op = gen_rtx_MEM (V2DImode,
   15225           39 :                               plus_constant (Pmode, op2, (i * 16)));
   15226           39 :             emit_move_insn (op, xmm_regs[i]);
   15227              :           }
   15228              : 
   15229           13 :         return target;
   15230              :       }
   15231           13 :     case IX86_BUILTIN_ENCODEKEY256U32:
   15232           13 :       {
   15233           13 :         rtx op, xmm_regs[7];
   15234              : 
   15235           13 :         arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   15236           13 :         arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
   15237           13 :         arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
   15238           13 :         arg3 = CALL_EXPR_ARG (exp, 3); // void *h
   15239              : 
   15240           13 :         op0 = expand_normal (arg0);
   15241           13 :         op1 = expand_normal (arg1);
   15242           13 :         op2 = expand_normal (arg2);
   15243           13 :         op3 = expand_normal (arg3);
   15244              : 
   15245           13 :         if (!REG_P (op0))
   15246            7 :           op0 = copy_to_mode_reg (SImode, op0);
   15247              : 
   15248           13 :         if (GET_MODE (op3) != Pmode)
   15249            1 :           op3 = convert_to_mode (Pmode, op3, 1);
   15250              : 
   15251              :         /* Force keylow and keyhi into xmm0 and xmm1 respectively.  */
   15252           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   15253           13 :         emit_move_insn (op, op1);
   15254           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
   15255           13 :         emit_move_insn (op, op2);
   15256              : 
   15257           78 :         for (i = 0; i < 4; i++)
   15258           52 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15259              : 
   15260           13 :         if (target == 0 || !register_operand (target, SImode))
   15261            2 :           target = gen_reg_rtx (SImode);
   15262              : 
   15263           13 :         emit_insn (gen_encodekey256u32 (target, op0));
   15264              : 
   15265           78 :         for (i = 0; i < 4; i++)
   15266              :           {
   15267           52 :             op = gen_rtx_MEM (V2DImode,
   15268           52 :                               plus_constant (Pmode, op3, (i * 16)));
   15269           52 :             emit_move_insn (op, xmm_regs[i]);
   15270              :           }
   15271              : 
   15272           13 :         return target;
   15273              :       }
   15274              : 
   15275           48 :     case IX86_BUILTIN_PREFETCH:
   15276           48 :       {
   15277           48 :         arg0 = CALL_EXPR_ARG (exp, 0); // const void *
   15278           48 :         arg1 = CALL_EXPR_ARG (exp, 1); // const int
   15279           48 :         arg2 = CALL_EXPR_ARG (exp, 2); // const int
   15280           48 :         arg3 = CALL_EXPR_ARG (exp, 3); // const int
   15281              : 
   15282           48 :         op0 = expand_normal (arg0);
   15283           48 :         op1 = expand_normal (arg1);
   15284           48 :         op2 = expand_normal (arg2);
   15285           48 :         op3 = expand_normal (arg3);
   15286              : 
   15287           48 :         if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
   15288              :           {
   15289            0 :             error ("second, third and fourth argument must be a const");
   15290            0 :             return const0_rtx;
   15291              :           }
   15292              : 
   15293           48 :         if (!IN_RANGE (INTVAL (op1), 0, 2))
   15294              :           {
   15295            1 :             warning (0, "invalid second argument to"
   15296              :                      " %<__builtin_ia32_prefetch%>; using zero");
   15297            1 :             op1 = const0_rtx;
   15298              :           }
   15299              : 
   15300           48 :         if (INTVAL (op3) == 1)
   15301              :           {
   15302            4 :             if (!IN_RANGE (INTVAL (op2), 2, 3))
   15303              :               {
   15304            1 :                 error ("invalid third argument");
   15305            1 :                 return const0_rtx;
   15306              :               }
   15307              : 
   15308            3 :             if (TARGET_64BIT && TARGET_PREFETCHI
   15309            6 :                 && local_func_symbolic_operand (op0, GET_MODE (op0)))
   15310            2 :               emit_insn (gen_prefetchi (op0, op2));
   15311              :             else
   15312              :               {
   15313            1 :                 warning (0, "instruction prefetch applies when in 64-bit mode"
   15314              :                             " with RIP-relative addressing and"
   15315              :                             " option %<-mprefetchi%>;"
   15316              :                             " they stay NOPs otherwise");
   15317            1 :                 emit_insn (gen_nop ());
   15318              :               }
   15319              :           }
   15320              :         else
   15321              :           {
   15322           44 :             if (INTVAL (op3) != 0)
   15323            1 :               warning (0, "invalid fourth argument to"
   15324              :                           " %<__builtin_ia32_prefetch%>; using zero");
   15325              : 
   15326           44 :             if (!address_operand (op0, VOIDmode))
   15327              :               {
   15328           10 :                 op0 = convert_memory_address (Pmode, op0);
   15329           10 :                 op0 = copy_addr_to_reg (op0);
   15330              :               }
   15331              : 
   15332           44 :             if (!IN_RANGE (INTVAL (op2), 0, 3))
   15333              :               {
   15334            1 :                 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
   15335            1 :                 op2 = const0_rtx;
   15336              :               }
   15337              : 
   15338           44 :             if (TARGET_3DNOW
   15339           26 :                 || TARGET_PREFETCH_SSE
   15340            0 :                 || TARGET_PRFCHW
   15341            0 :                 || TARGET_MOVRS)
   15342           44 :               emit_insn (gen_prefetch (op0, op1, op2));
   15343            0 :             else if (!MEM_P (op0) && side_effects_p (op0))
   15344              :               /* Don't do anything with direct references to volatile memory,
   15345              :                  but generate code to handle other side effects.  */
   15346            0 :               emit_insn (op0);
   15347              :           }
   15348              : 
   15349              :         return 0;
   15350              :       }
   15351              : 
   15352           21 :     case IX86_BUILTIN_PREFETCHI:
   15353           21 :       {
   15354           21 :         arg0 = CALL_EXPR_ARG (exp, 0); // const void *
   15355           21 :         arg1 = CALL_EXPR_ARG (exp, 1); // const int
   15356              : 
   15357           21 :         op0 = expand_normal (arg0);
   15358           21 :         op1 = expand_normal (arg1);
   15359              : 
   15360           21 :         if (!CONST_INT_P (op1))
   15361              :           {
   15362            0 :             error ("second argument must be a const");
   15363            0 :             return const0_rtx;
   15364              :           }
   15365              : 
   15366              :         /* GOT/PLT_PIC should not be available for instruction prefetch.
   15367              :            It must be a real instruction address.  */
   15368           21 :         if (TARGET_64BIT
   15369           21 :             && local_func_symbolic_operand (op0, GET_MODE (op0)))
   15370            4 :           emit_insn (gen_prefetchi (op0, op1));
   15371              :         else
   15372              :           {
   15373              :             /* Ignore the hint.  */
   15374           17 :             warning (0, "instruction prefetch applies when in 64-bit mode"
   15375              :                         " with RIP-relative addressing and"
   15376              :                         " option %<-mprefetchi%>;"
   15377              :                         " they stay NOPs otherwise");
   15378           17 :             emit_insn (gen_nop ());
   15379              :           }
   15380              : 
   15381              :         return 0;
   15382              :       }
   15383              : 
   15384           53 :     case IX86_BUILTIN_URDMSR:
   15385           53 :     case IX86_BUILTIN_UWRMSR:
   15386           53 :       {
   15387           53 :         arg0 = CALL_EXPR_ARG (exp, 0);
   15388           53 :         op0 = expand_normal (arg0);
   15389              : 
   15390           53 :         if (CONST_INT_P (op0))
   15391              :           {
   15392           12 :             unsigned HOST_WIDE_INT val = UINTVAL (op0);
   15393           12 :             if (val > 0xffffffff)
   15394            2 :               op0 = force_reg (DImode, op0);
   15395              :           }
   15396              :         else
   15397           41 :           op0 = force_reg (DImode, op0);
   15398              : 
   15399           53 :         if (fcode == IX86_BUILTIN_UWRMSR)
   15400              :           {
   15401           26 :             arg1 = CALL_EXPR_ARG (exp, 1);
   15402           26 :             op1 = expand_normal (arg1);
   15403           26 :             op1 = force_reg (DImode, op1);
   15404           26 :             icode = CODE_FOR_uwrmsr;
   15405           26 :             target = 0;
   15406              :           }
   15407              :         else
   15408              :           {
   15409           27 :             if (target == 0 || !register_operand (target, DImode))
   15410            1 :               target = gen_reg_rtx (DImode);
   15411              :             icode = CODE_FOR_urdmsr;
   15412              :             op1 = op0;
   15413              :             op0 = target;
   15414              :           }
   15415           53 :         emit_insn (GEN_FCN (icode) (op0, op1));
   15416           53 :         return target;
   15417              :       }
   15418              : 
   15419          229 :     case IX86_BUILTIN_VEC_INIT_V2SI:
   15420          229 :     case IX86_BUILTIN_VEC_INIT_V4HI:
   15421          229 :     case IX86_BUILTIN_VEC_INIT_V8QI:
   15422          229 :       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
   15423              : 
   15424          399 :     case IX86_BUILTIN_VEC_EXT_V2DF:
   15425          399 :     case IX86_BUILTIN_VEC_EXT_V2DI:
   15426          399 :     case IX86_BUILTIN_VEC_EXT_V4SF:
   15427          399 :     case IX86_BUILTIN_VEC_EXT_V4SI:
   15428          399 :     case IX86_BUILTIN_VEC_EXT_V8HI:
   15429          399 :     case IX86_BUILTIN_VEC_EXT_V2SI:
   15430          399 :     case IX86_BUILTIN_VEC_EXT_V4HI:
   15431          399 :     case IX86_BUILTIN_VEC_EXT_V16QI:
   15432          399 :       return ix86_expand_vec_ext_builtin (exp, target);
   15433              : 
   15434          204 :     case IX86_BUILTIN_VEC_SET_V2DI:
   15435          204 :     case IX86_BUILTIN_VEC_SET_V4SF:
   15436          204 :     case IX86_BUILTIN_VEC_SET_V4SI:
   15437          204 :     case IX86_BUILTIN_VEC_SET_V8HI:
   15438          204 :     case IX86_BUILTIN_VEC_SET_V4HI:
   15439          204 :     case IX86_BUILTIN_VEC_SET_V16QI:
   15440          204 :       return ix86_expand_vec_set_builtin (exp);
   15441              : 
   15442            0 :     case IX86_BUILTIN_NANQ:
   15443            0 :     case IX86_BUILTIN_NANSQ:
   15444            0 :       return expand_call (exp, target, ignore);
   15445              : 
   15446           18 :     case IX86_BUILTIN_RDPID:
   15447              : 
   15448           18 :       op0 = gen_reg_rtx (word_mode);
   15449              : 
   15450           18 :       if (TARGET_64BIT)
   15451              :         {
   15452           18 :           insn = gen_rdpid_rex64 (op0);
   15453           18 :           op0 = convert_to_mode (SImode, op0, 1);
   15454              :         }
   15455              :       else
   15456            0 :         insn = gen_rdpid (op0);
   15457              : 
   15458           18 :       emit_insn (insn);
   15459              : 
   15460           18 :       if (target == 0
   15461           18 :           || !register_operand (target, SImode))
   15462            0 :         target = gen_reg_rtx (SImode);
   15463              : 
   15464           18 :       emit_move_insn (target, op0);
   15465           18 :       return target;
   15466              : 
   15467           75 :     case IX86_BUILTIN_2INTERSECTD512:
   15468           75 :     case IX86_BUILTIN_2INTERSECTQ512:
   15469           75 :     case IX86_BUILTIN_2INTERSECTD256:
   15470           75 :     case IX86_BUILTIN_2INTERSECTQ256:
   15471           75 :     case IX86_BUILTIN_2INTERSECTD128:
   15472           75 :     case IX86_BUILTIN_2INTERSECTQ128:
   15473           75 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15474           75 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15475           75 :       arg2 = CALL_EXPR_ARG (exp, 2);
   15476           75 :       arg3 = CALL_EXPR_ARG (exp, 3);
   15477           75 :       op0 = expand_normal (arg0);
   15478           75 :       op1 = expand_normal (arg1);
   15479           75 :       op2 = expand_normal (arg2);
   15480           75 :       op3 = expand_normal (arg3);
   15481              : 
   15482           75 :       if (!address_operand (op0, VOIDmode))
   15483              :         {
   15484           25 :           op0 = convert_memory_address (Pmode, op0);
   15485           25 :           op0 = copy_addr_to_reg (op0);
   15486              :         }
   15487           75 :       if (!address_operand (op1, VOIDmode))
   15488              :         {
   15489           25 :           op1 = convert_memory_address (Pmode, op1);
   15490           25 :           op1 = copy_addr_to_reg (op1);
   15491              :         }
   15492              : 
   15493           75 :       switch (fcode)
   15494              :         {
   15495              :         case IX86_BUILTIN_2INTERSECTD512:
   15496              :           mode4 = P2HImode;
   15497              :           icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
   15498              :           break;
   15499              :         case IX86_BUILTIN_2INTERSECTQ512:
   15500              :           mode4 = P2QImode;
   15501              :           icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
   15502              :           break;
   15503              :         case IX86_BUILTIN_2INTERSECTD256:
   15504              :           mode4 = P2QImode;
   15505              :           icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
   15506              :           break;
   15507              :         case IX86_BUILTIN_2INTERSECTQ256:
   15508              :           mode4 = P2QImode;
   15509              :           icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
   15510              :           break;
   15511              :         case IX86_BUILTIN_2INTERSECTD128:
   15512              :           mode4 = P2QImode;
   15513              :           icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
   15514              :           break;
   15515              :         case IX86_BUILTIN_2INTERSECTQ128:
   15516              :           mode4 = P2QImode;
   15517              :           icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
   15518              :           break;
   15519            0 :         default:
   15520            0 :           gcc_unreachable ();
   15521              :         }
   15522              : 
   15523           75 :       mode2 = insn_data[icode].operand[1].mode;
   15524           75 :       mode3 = insn_data[icode].operand[2].mode;
   15525           75 :       if (!insn_data[icode].operand[1].predicate (op2, mode2))
   15526           25 :         op2 = copy_to_mode_reg (mode2, op2);
   15527           75 :       if (!insn_data[icode].operand[2].predicate (op3, mode3))
   15528            6 :         op3 = copy_to_mode_reg (mode3, op3);
   15529              : 
   15530           75 :       op4 = gen_reg_rtx (mode4);
   15531           75 :       emit_insn (GEN_FCN (icode) (op4, op2, op3));
   15532           75 :       mode0 = mode4 == P2HImode ? HImode : QImode;
   15533           75 :       emit_move_insn (gen_rtx_MEM (mode0, op0),
   15534           75 :                       gen_lowpart (mode0, op4));
   15535           75 :       emit_move_insn (gen_rtx_MEM (mode0, op1),
   15536              :                       gen_highpart (mode0, op4));
   15537              : 
   15538           75 :       return 0;
   15539              : 
   15540          102 :     case IX86_BUILTIN_RDPMC:
   15541          102 :     case IX86_BUILTIN_RDTSC:
   15542          102 :     case IX86_BUILTIN_RDTSCP:
   15543          102 :     case IX86_BUILTIN_XGETBV:
   15544              : 
   15545          102 :       op0 = gen_reg_rtx (DImode);
   15546          102 :       op1 = gen_reg_rtx (DImode);
   15547              : 
   15548          102 :       if (fcode == IX86_BUILTIN_RDPMC)
   15549              :         {
   15550           22 :           arg0 = CALL_EXPR_ARG (exp, 0);
   15551           22 :           op2 = expand_normal (arg0);
   15552           22 :           if (!register_operand (op2, SImode))
   15553           11 :             op2 = copy_to_mode_reg (SImode, op2);
   15554              : 
   15555           22 :           insn = (TARGET_64BIT
   15556           22 :                   ? gen_rdpmc_rex64 (op0, op1, op2)
   15557            0 :                   : gen_rdpmc (op0, op2));
   15558           22 :           emit_insn (insn);
   15559              :         }
   15560           80 :       else if (fcode == IX86_BUILTIN_XGETBV)
   15561              :         {
   15562           22 :           arg0 = CALL_EXPR_ARG (exp, 0);
   15563           22 :           op2 = expand_normal (arg0);
   15564           22 :           if (!register_operand (op2, SImode))
   15565            1 :             op2 = copy_to_mode_reg (SImode, op2);
   15566              : 
   15567           22 :           insn = (TARGET_64BIT
   15568           22 :                   ? gen_xgetbv_rex64 (op0, op1, op2)
   15569            0 :                   : gen_xgetbv (op0, op2));
   15570           22 :           emit_insn (insn);
   15571              :         }
   15572           58 :       else if (fcode == IX86_BUILTIN_RDTSC)
   15573              :         {
   15574           36 :           insn = (TARGET_64BIT
   15575           36 :                   ? gen_rdtsc_rex64 (op0, op1)
   15576            2 :                   : gen_rdtsc (op0));
   15577           36 :           emit_insn (insn);
   15578              :         }
   15579              :       else
   15580              :         {
   15581           22 :           op2 = gen_reg_rtx (SImode);
   15582              : 
   15583           22 :           insn = (TARGET_64BIT
   15584           22 :                   ? gen_rdtscp_rex64 (op0, op1, op2)
   15585            0 :                   : gen_rdtscp (op0, op2));
   15586           22 :           emit_insn (insn);
   15587              : 
   15588           22 :           arg0 = CALL_EXPR_ARG (exp, 0);
   15589           22 :           op4 = expand_normal (arg0);
   15590           22 :           if (!address_operand (op4, VOIDmode))
   15591              :             {
   15592           10 :               op4 = convert_memory_address (Pmode, op4);
   15593           10 :               op4 = copy_addr_to_reg (op4);
   15594              :             }
   15595           22 :           emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
   15596              :         }
   15597              : 
   15598          102 :       if (target == 0
   15599          102 :           || !register_operand (target, DImode))
   15600           10 :         target = gen_reg_rtx (DImode);
   15601              : 
   15602          102 :       if (TARGET_64BIT)
   15603              :         {
   15604          100 :           op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
   15605              :                                      op1, 1, OPTAB_DIRECT);
   15606          100 :           op0 = expand_simple_binop (DImode, IOR, op0, op1,
   15607              :                                      op0, 1, OPTAB_DIRECT);
   15608              :         }
   15609              : 
   15610          102 :       emit_move_insn (target, op0);
   15611          102 :       return target;
   15612              : 
   15613           61 :     case IX86_BUILTIN_ENQCMD:
   15614           61 :     case IX86_BUILTIN_ENQCMDS:
   15615           61 :     case IX86_BUILTIN_MOVDIR64B:
   15616              : 
   15617           61 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15618           61 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15619           61 :       op0 = expand_normal (arg0);
   15620           61 :       op1 = expand_normal (arg1);
   15621              : 
   15622           61 :       op0 = ix86_zero_extend_to_Pmode (op0);
   15623           61 :       if (!address_operand (op1, VOIDmode))
   15624              :       {
   15625           28 :         op1 = convert_memory_address (Pmode, op1);
   15626           28 :         op1 = copy_addr_to_reg (op1);
   15627              :       }
   15628           61 :       op1 = gen_rtx_MEM (XImode, op1);
   15629              : 
   15630           61 :       if (fcode == IX86_BUILTIN_MOVDIR64B)
   15631              :         {
   15632           24 :           emit_insn (gen_movdir64b (Pmode, op0, op1));
   15633           23 :           return 0;
   15634              :         }
   15635              :       else
   15636              :         {
   15637           38 :           if (target == 0
   15638           38 :               || !register_operand (target, SImode))
   15639            0 :             target = gen_reg_rtx (SImode);
   15640              : 
   15641           38 :           emit_move_insn (target, const0_rtx);
   15642           38 :           target = gen_rtx_SUBREG (QImode, target, 0);
   15643              : 
   15644           19 :           int unspecv = (fcode == IX86_BUILTIN_ENQCMD
   15645           38 :                          ? UNSPECV_ENQCMD
   15646              :                          : UNSPECV_ENQCMDS);
   15647           38 :           icode = code_for_enqcmd (unspecv, Pmode);
   15648           38 :           emit_insn (GEN_FCN (icode) (op0, op1));
   15649              : 
   15650           38 :           emit_insn
   15651           38 :             (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   15652              :                           gen_rtx_fmt_ee (EQ, QImode,
   15653              :                                           gen_rtx_REG (CCZmode, FLAGS_REG),
   15654              :                                           const0_rtx)));
   15655           38 :           return SUBREG_REG (target);
   15656              :         }
   15657              : 
   15658        14775 :     case IX86_BUILTIN_FXSAVE:
   15659        14775 :     case IX86_BUILTIN_FXRSTOR:
   15660        14775 :     case IX86_BUILTIN_FXSAVE64:
   15661        14775 :     case IX86_BUILTIN_FXRSTOR64:
   15662        14775 :     case IX86_BUILTIN_FNSTENV:
   15663        14775 :     case IX86_BUILTIN_FLDENV:
   15664        14775 :       mode0 = BLKmode;
   15665        14775 :       switch (fcode)
   15666              :         {
   15667              :         case IX86_BUILTIN_FXSAVE:
   15668              :           icode = CODE_FOR_fxsave;
   15669              :           break;
   15670           19 :         case IX86_BUILTIN_FXRSTOR:
   15671           19 :           icode = CODE_FOR_fxrstor;
   15672           19 :           break;
   15673           23 :         case IX86_BUILTIN_FXSAVE64:
   15674           23 :           icode = CODE_FOR_fxsave64;
   15675           23 :           break;
   15676           21 :         case IX86_BUILTIN_FXRSTOR64:
   15677           21 :           icode = CODE_FOR_fxrstor64;
   15678           21 :           break;
   15679         7257 :         case IX86_BUILTIN_FNSTENV:
   15680         7257 :           icode = CODE_FOR_fnstenv;
   15681         7257 :           break;
   15682         7435 :         case IX86_BUILTIN_FLDENV:
   15683         7435 :           icode = CODE_FOR_fldenv;
   15684         7435 :           break;
   15685            0 :         default:
   15686            0 :           gcc_unreachable ();
   15687              :         }
   15688              : 
   15689        14775 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15690        14775 :       op0 = expand_normal (arg0);
   15691              : 
   15692        14775 :       if (!address_operand (op0, VOIDmode))
   15693              :         {
   15694           36 :           op0 = convert_memory_address (Pmode, op0);
   15695           36 :           op0 = copy_addr_to_reg (op0);
   15696              :         }
   15697        14775 :       op0 = gen_rtx_MEM (mode0, op0);
   15698              : 
   15699        14775 :       pat = GEN_FCN (icode) (op0);
   15700        14775 :       if (pat)
   15701        14775 :         emit_insn (pat);
   15702              :       return 0;
   15703              : 
   15704           21 :     case IX86_BUILTIN_XSETBV:
   15705           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15706           21 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15707           21 :       op0 = expand_normal (arg0);
   15708           21 :       op1 = expand_normal (arg1);
   15709              : 
   15710           21 :       if (!REG_P (op0))
   15711            1 :         op0 = copy_to_mode_reg (SImode, op0);
   15712              : 
   15713           21 :       op1 = force_reg (DImode, op1);
   15714              : 
   15715           21 :       if (TARGET_64BIT)
   15716              :         {
   15717           21 :           op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   15718              :                                      NULL, 1, OPTAB_DIRECT);
   15719              : 
   15720           21 :           icode = CODE_FOR_xsetbv_rex64;
   15721              : 
   15722           21 :           op2 = gen_lowpart (SImode, op2);
   15723           21 :           op1 = gen_lowpart (SImode, op1);
   15724           21 :           pat = GEN_FCN (icode) (op0, op1, op2);
   15725              :         }
   15726              :       else
   15727              :         {
   15728            0 :           icode = CODE_FOR_xsetbv;
   15729              : 
   15730            0 :           pat = GEN_FCN (icode) (op0, op1);
   15731              :         }
   15732           21 :       if (pat)
   15733           21 :         emit_insn (pat);
   15734              :       return 0;
   15735              : 
   15736          232 :     case IX86_BUILTIN_XSAVE:
   15737          232 :     case IX86_BUILTIN_XRSTOR:
   15738          232 :     case IX86_BUILTIN_XSAVE64:
   15739          232 :     case IX86_BUILTIN_XRSTOR64:
   15740          232 :     case IX86_BUILTIN_XSAVEOPT:
   15741          232 :     case IX86_BUILTIN_XSAVEOPT64:
   15742          232 :     case IX86_BUILTIN_XSAVES:
   15743          232 :     case IX86_BUILTIN_XRSTORS:
   15744          232 :     case IX86_BUILTIN_XSAVES64:
   15745          232 :     case IX86_BUILTIN_XRSTORS64:
   15746          232 :     case IX86_BUILTIN_XSAVEC:
   15747          232 :     case IX86_BUILTIN_XSAVEC64:
   15748          232 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15749          232 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15750          232 :       op0 = expand_normal (arg0);
   15751          232 :       op1 = expand_normal (arg1);
   15752              : 
   15753          232 :       if (!address_operand (op0, VOIDmode))
   15754              :         {
   15755          108 :           op0 = convert_memory_address (Pmode, op0);
   15756          108 :           op0 = copy_addr_to_reg (op0);
   15757              :         }
   15758          232 :       op0 = gen_rtx_MEM (BLKmode, op0);
   15759              : 
   15760          232 :       op1 = force_reg (DImode, op1);
   15761              : 
   15762          232 :       if (TARGET_64BIT)
   15763              :         {
   15764          232 :           op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   15765              :                                      NULL, 1, OPTAB_DIRECT);
   15766          232 :           switch (fcode)
   15767              :             {
   15768              :             case IX86_BUILTIN_XSAVE:
   15769              :               icode = CODE_FOR_xsave_rex64;
   15770              :               break;
   15771           19 :             case IX86_BUILTIN_XRSTOR:
   15772           19 :               icode = CODE_FOR_xrstor_rex64;
   15773           19 :               break;
   15774           21 :             case IX86_BUILTIN_XSAVE64:
   15775           21 :               icode = CODE_FOR_xsave64;
   15776           21 :               break;
   15777           21 :             case IX86_BUILTIN_XRSTOR64:
   15778           21 :               icode = CODE_FOR_xrstor64;
   15779           21 :               break;
   15780           19 :             case IX86_BUILTIN_XSAVEOPT:
   15781           19 :               icode = CODE_FOR_xsaveopt_rex64;
   15782           19 :               break;
   15783           19 :             case IX86_BUILTIN_XSAVEOPT64:
   15784           19 :               icode = CODE_FOR_xsaveopt64;
   15785           19 :               break;
   15786           19 :             case IX86_BUILTIN_XSAVES:
   15787           19 :               icode = CODE_FOR_xsaves_rex64;
   15788           19 :               break;
   15789           19 :             case IX86_BUILTIN_XRSTORS:
   15790           19 :               icode = CODE_FOR_xrstors_rex64;
   15791           19 :               break;
   15792           19 :             case IX86_BUILTIN_XSAVES64:
   15793           19 :               icode = CODE_FOR_xsaves64;
   15794           19 :               break;
   15795           19 :             case IX86_BUILTIN_XRSTORS64:
   15796           19 :               icode = CODE_FOR_xrstors64;
   15797           19 :               break;
   15798           19 :             case IX86_BUILTIN_XSAVEC:
   15799           19 :               icode = CODE_FOR_xsavec_rex64;
   15800           19 :               break;
   15801           19 :             case IX86_BUILTIN_XSAVEC64:
   15802           19 :               icode = CODE_FOR_xsavec64;
   15803           19 :               break;
   15804            0 :             default:
   15805            0 :               gcc_unreachable ();
   15806              :             }
   15807              : 
   15808          232 :           op2 = gen_lowpart (SImode, op2);
   15809          232 :           op1 = gen_lowpart (SImode, op1);
   15810          232 :           pat = GEN_FCN (icode) (op0, op1, op2);
   15811              :         }
   15812              :       else
   15813              :         {
   15814            0 :           switch (fcode)
   15815              :             {
   15816              :             case IX86_BUILTIN_XSAVE:
   15817              :               icode = CODE_FOR_xsave;
   15818              :               break;
   15819              :             case IX86_BUILTIN_XRSTOR:
   15820              :               icode = CODE_FOR_xrstor;
   15821              :               break;
   15822              :             case IX86_BUILTIN_XSAVEOPT:
   15823              :               icode = CODE_FOR_xsaveopt;
   15824              :               break;
   15825              :             case IX86_BUILTIN_XSAVES:
   15826              :               icode = CODE_FOR_xsaves;
   15827              :               break;
   15828              :             case IX86_BUILTIN_XRSTORS:
   15829              :               icode = CODE_FOR_xrstors;
   15830              :               break;
   15831              :             case IX86_BUILTIN_XSAVEC:
   15832              :               icode = CODE_FOR_xsavec;
   15833              :               break;
   15834            0 :             default:
   15835            0 :               gcc_unreachable ();
   15836              :             }
   15837            0 :           pat = GEN_FCN (icode) (op0, op1);
   15838              :         }
   15839              : 
   15840          232 :       if (pat)
   15841          232 :         emit_insn (pat);
   15842              :       return 0;
   15843              : 
   15844          144 :     case IX86_BUILTIN_LDTILECFG:
   15845          144 :     case IX86_BUILTIN_STTILECFG:
   15846          144 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15847          144 :       op0 = expand_normal (arg0);
   15848              : 
   15849          144 :       if (!address_operand (op0, VOIDmode))
   15850              :         {
   15851            8 :           op0 = convert_memory_address (Pmode, op0);
   15852            8 :           op0 = copy_addr_to_reg (op0);
   15853              :         }
   15854          144 :       op0 = gen_rtx_MEM (BLKmode, op0);
   15855          144 :       if (fcode == IX86_BUILTIN_LDTILECFG)
   15856              :         icode = CODE_FOR_ldtilecfg;
   15857              :       else
   15858           93 :         icode = CODE_FOR_sttilecfg;
   15859          144 :       pat = GEN_FCN (icode) (op0);
   15860          144 :       emit_insn (pat);
   15861          144 :       return 0;
   15862              : 
   15863           18 :     case IX86_BUILTIN_LLWPCB:
   15864           18 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15865           18 :       op0 = expand_normal (arg0);
   15866              : 
   15867           18 :       if (!register_operand (op0, Pmode))
   15868            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   15869           18 :       emit_insn (gen_lwp_llwpcb (Pmode, op0));
   15870           18 :       return 0;
   15871              : 
   15872           18 :     case IX86_BUILTIN_SLWPCB:
   15873           18 :       if (!target
   15874           18 :           || !register_operand (target, Pmode))
   15875            0 :         target = gen_reg_rtx (Pmode);
   15876           18 :       emit_insn (gen_lwp_slwpcb (Pmode, target));
   15877           18 :       return target;
   15878              : 
   15879           51 :     case IX86_BUILTIN_LWPVAL32:
   15880           51 :     case IX86_BUILTIN_LWPVAL64:
   15881           51 :     case IX86_BUILTIN_LWPINS32:
   15882           51 :     case IX86_BUILTIN_LWPINS64:
   15883           51 :       mode = ((fcode == IX86_BUILTIN_LWPVAL32
   15884           51 :                || fcode == IX86_BUILTIN_LWPINS32)
   15885           51 :               ? SImode : DImode);
   15886              : 
   15887           51 :       if (fcode == IX86_BUILTIN_LWPVAL32
   15888           51 :           || fcode == IX86_BUILTIN_LWPVAL64)
   15889           26 :         icode = code_for_lwp_lwpval (mode);
   15890              :       else
   15891           25 :         icode = code_for_lwp_lwpins (mode);
   15892              : 
   15893           51 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15894           51 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15895           51 :       arg2 = CALL_EXPR_ARG (exp, 2);
   15896           51 :       op0 = expand_normal (arg0);
   15897           51 :       op1 = expand_normal (arg1);
   15898           51 :       op2 = expand_normal (arg2);
   15899           51 :       mode0 = insn_data[icode].operand[0].mode;
   15900              : 
   15901           51 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   15902           13 :         op0 = copy_to_mode_reg (mode0, op0);
   15903           51 :       if (!insn_data[icode].operand[1].predicate (op1, SImode))
   15904            0 :         op1 = copy_to_mode_reg (SImode, op1);
   15905              : 
   15906           51 :       if (!CONST_INT_P (op2))
   15907              :         {
   15908            0 :           error ("the last argument must be a 32-bit immediate");
   15909            0 :           return const0_rtx;
   15910              :         }
   15911              : 
   15912           51 :       emit_insn (GEN_FCN (icode) (op0, op1, op2));
   15913              : 
   15914           51 :       if (fcode == IX86_BUILTIN_LWPINS32
   15915           51 :           || fcode == IX86_BUILTIN_LWPINS64)
   15916              :         {
   15917           25 :           if (target == 0
   15918           25 :               || !nonimmediate_operand (target, QImode))
   15919            0 :             target = gen_reg_rtx (QImode);
   15920              : 
   15921           25 :           pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   15922              :                             const0_rtx);
   15923           25 :           emit_insn (gen_rtx_SET (target, pat));
   15924              : 
   15925           25 :           return target;
   15926              :         }
   15927              :       else
   15928              :         return 0;
   15929              : 
   15930           18 :     case IX86_BUILTIN_BEXTRI32:
   15931           18 :     case IX86_BUILTIN_BEXTRI64:
   15932           18 :       mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
   15933              : 
   15934           18 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15935           18 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15936           18 :       op0 = expand_normal (arg0);
   15937           18 :       op1 = expand_normal (arg1);
   15938              : 
   15939           18 :       if (!CONST_INT_P (op1))
   15940              :         {
   15941            0 :           error ("last argument must be an immediate");
   15942            0 :           return const0_rtx;
   15943              :         }
   15944              :       else
   15945              :         {
   15946           18 :           unsigned char lsb_index = UINTVAL (op1);
   15947           18 :           unsigned char length = UINTVAL (op1) >> 8;
   15948              : 
   15949           18 :           unsigned char bitsize = GET_MODE_BITSIZE (mode);
   15950              : 
   15951           18 :           icode = code_for_tbm_bextri (mode);
   15952              : 
   15953           18 :           mode1 = insn_data[icode].operand[1].mode;
   15954           18 :           if (!insn_data[icode].operand[1].predicate (op0, mode1))
   15955           12 :             op0 = copy_to_mode_reg (mode1, op0);
   15956              : 
   15957           18 :           mode0 = insn_data[icode].operand[0].mode;
   15958           18 :           if (target == 0
   15959           18 :               || !register_operand (target, mode0))
   15960            0 :             target = gen_reg_rtx (mode0);
   15961              : 
   15962           18 :           if (length == 0 || lsb_index >= bitsize)
   15963              :             {
   15964            8 :               emit_move_insn (target, const0_rtx);
   15965            8 :               return target;
   15966              :             }
   15967              : 
   15968           10 :           if (length + lsb_index > bitsize)
   15969            5 :             length = bitsize - lsb_index;
   15970              : 
   15971           10 :           op1 = GEN_INT (length);
   15972           10 :           op2 = GEN_INT (lsb_index);
   15973              : 
   15974           10 :           emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
   15975           10 :           return target;
   15976              :         }
   15977              : 
   15978           21 :     case IX86_BUILTIN_RDRAND16_STEP:
   15979           21 :       mode = HImode;
   15980           21 :       goto rdrand_step;
   15981              : 
   15982           42 :     case IX86_BUILTIN_RDRAND32_STEP:
   15983           42 :       mode = SImode;
   15984           42 :       goto rdrand_step;
   15985              : 
   15986              :     case IX86_BUILTIN_RDRAND64_STEP:
   15987              :       mode = DImode;
   15988              : 
   15989           83 : rdrand_step:
   15990           83 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15991           83 :       op1 = expand_normal (arg0);
   15992           83 :       if (!address_operand (op1, VOIDmode))
   15993              :         {
   15994           29 :           op1 = convert_memory_address (Pmode, op1);
   15995           29 :           op1 = copy_addr_to_reg (op1);
   15996              :         }
   15997              : 
   15998           83 :       op0 = gen_reg_rtx (mode);
   15999           83 :       emit_insn (gen_rdrand (mode, op0));
   16000              : 
   16001           83 :       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   16002              : 
   16003           83 :       op1 = force_reg (SImode, const1_rtx);
   16004              : 
   16005              :       /* Emit SImode conditional move.  */
   16006           83 :       if (mode == HImode)
   16007              :         {
   16008           21 :           if (TARGET_ZERO_EXTEND_WITH_AND
   16009           21 :               && optimize_function_for_speed_p (cfun))
   16010              :             {
   16011            0 :               op2 = force_reg (SImode, const0_rtx);
   16012              : 
   16013            0 :               emit_insn (gen_movstricthi
   16014            0 :                          (gen_lowpart (HImode, op2), op0));
   16015              :             }
   16016              :           else
   16017              :             {
   16018           21 :               op2 = gen_reg_rtx (SImode);
   16019              : 
   16020           21 :               emit_insn (gen_zero_extendhisi2 (op2, op0));
   16021              :             }
   16022              :         }
   16023           62 :       else if (mode == SImode)
   16024              :         op2 = op0;
   16025              :       else
   16026           20 :         op2 = gen_rtx_SUBREG (SImode, op0, 0);
   16027              : 
   16028           83 :       if (target == 0
   16029           83 :           || !register_operand (target, SImode))
   16030            7 :         target = gen_reg_rtx (SImode);
   16031              : 
   16032           83 :       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
   16033              :                          const0_rtx);
   16034           83 :       emit_insn (gen_rtx_SET (target,
   16035              :                               gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
   16036           83 :       return target;
   16037              : 
   16038           19 :     case IX86_BUILTIN_RDSEED16_STEP:
   16039           19 :       mode = HImode;
   16040           19 :       goto rdseed_step;
   16041              : 
   16042           28 :     case IX86_BUILTIN_RDSEED32_STEP:
   16043           28 :       mode = SImode;
   16044           28 :       goto rdseed_step;
   16045              : 
   16046              :     case IX86_BUILTIN_RDSEED64_STEP:
   16047              :       mode = DImode;
   16048              : 
   16049           66 : rdseed_step:
   16050           66 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16051           66 :       op1 = expand_normal (arg0);
   16052           66 :       if (!address_operand (op1, VOIDmode))
   16053              :         {
   16054           28 :           op1 = convert_memory_address (Pmode, op1);
   16055           28 :           op1 = copy_addr_to_reg (op1);
   16056              :         }
   16057              : 
   16058           66 :       op0 = gen_reg_rtx (mode);
   16059           66 :       emit_insn (gen_rdseed (mode, op0));
   16060              : 
   16061           66 :       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   16062              : 
   16063           66 :       op2 = gen_reg_rtx (QImode);
   16064              : 
   16065           66 :       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   16066              :                          const0_rtx);
   16067           66 :       emit_insn (gen_rtx_SET (op2, pat));
   16068              : 
   16069           66 :       if (target == 0
   16070           66 :           || !register_operand (target, SImode))
   16071            1 :         target = gen_reg_rtx (SImode);
   16072              : 
   16073           66 :       emit_insn (gen_zero_extendqisi2 (target, op2));
   16074           66 :       return target;
   16075              : 
   16076           38 :     case IX86_BUILTIN_SBB32:
   16077           38 :       icode = CODE_FOR_subborrowsi;
   16078           38 :       icode2 = CODE_FOR_subborrowsi_0;
   16079           38 :       mode0 = SImode;
   16080           38 :       mode1 = DImode;
   16081           38 :       mode2 = CCmode;
   16082           38 :       goto handlecarry;
   16083              : 
   16084           44 :     case IX86_BUILTIN_SBB64:
   16085           44 :       icode = CODE_FOR_subborrowdi;
   16086           44 :       icode2 = CODE_FOR_subborrowdi_0;
   16087           44 :       mode0 = DImode;
   16088           44 :       mode1 = TImode;
   16089           44 :       mode2 = CCmode;
   16090           44 :       goto handlecarry;
   16091              : 
   16092           68 :     case IX86_BUILTIN_ADDCARRYX32:
   16093           68 :       icode = CODE_FOR_addcarrysi;
   16094           68 :       icode2 = CODE_FOR_addcarrysi_0;
   16095           68 :       mode0 = SImode;
   16096           68 :       mode1 = DImode;
   16097           68 :       mode2 = CCCmode;
   16098           68 :       goto handlecarry;
   16099              : 
   16100              :     case IX86_BUILTIN_ADDCARRYX64:
   16101              :       icode = CODE_FOR_addcarrydi;
   16102              :       icode2 = CODE_FOR_addcarrydi_0;
   16103              :       mode0 = DImode;
   16104              :       mode1 = TImode;
   16105              :       mode2 = CCCmode;
   16106              : 
   16107          212 :     handlecarry:
   16108          212 :       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
   16109          212 :       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
   16110          212 :       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
   16111          212 :       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
   16112              : 
   16113          212 :       op1 = expand_normal (arg0);
   16114              : 
   16115          212 :       op2 = expand_normal (arg1);
   16116          212 :       if (!register_operand (op2, mode0))
   16117          117 :         op2 = copy_to_mode_reg (mode0, op2);
   16118              : 
   16119          212 :       op3 = expand_normal (arg2);
   16120          212 :       if (!register_operand (op3, mode0))
   16121          120 :         op3 = copy_to_mode_reg (mode0, op3);
   16122              : 
   16123          212 :       op4 = expand_normal (arg3);
   16124          212 :       if (!address_operand (op4, VOIDmode))
   16125              :         {
   16126           67 :           op4 = convert_memory_address (Pmode, op4);
   16127           67 :           op4 = copy_addr_to_reg (op4);
   16128              :         }
   16129              : 
   16130          212 :       op0 = gen_reg_rtx (mode0);
   16131          212 :       if (op1 == const0_rtx)
   16132              :         {
    16133              :           /* If arg0 is 0, optimize right away into add or sub
    16134              :              instruction that sets the flags in MODE2 (CCmode for
                         :              the subborrow cases, CCCmode for the addcarry cases).  */
   16135           21 :           op1 = gen_rtx_REG (mode2, FLAGS_REG);
   16136           21 :           emit_insn (GEN_FCN (icode2) (op0, op2, op3));
   16137              :         }
   16138              :       else
   16139              :         {
   16140              :           /* Generate CF from input operand.  */
   16141          191 :           ix86_expand_carry (op1);
   16142              : 
   16143              :           /* Generate instruction that consumes CF.  */
   16144          191 :           op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
   16145          191 :           pat = gen_rtx_LTU (mode1, op1, const0_rtx);
   16146          191 :           pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
   16147          191 :           emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
   16148              :         }
   16149              : 
   16150              :       /* Return current CF value.  */
   16151          212 :       if (target == 0)
   16152           14 :         target = gen_reg_rtx (QImode);
   16153              : 
   16154          212 :       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
   16155          212 :       emit_insn (gen_rtx_SET (target, pat));
   16156              : 
   16157              :       /* Store the result.  */
   16158          212 :       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
   16159              : 
   16160          212 :       return target;
   16161              : 
   16162           24 :     case IX86_BUILTIN_READ_FLAGS:
   16163           24 :       if (ignore)
   16164            1 :         return const0_rtx;
   16165              : 
   16166           23 :       emit_insn (gen_pushfl ());
   16167              : 
   16168           23 :       if (optimize
   16169           11 :           || target == NULL_RTX
   16170           11 :           || !nonimmediate_operand (target, word_mode)
   16171           34 :           || GET_MODE (target) != word_mode)
   16172           12 :         target = gen_reg_rtx (word_mode);
   16173              : 
   16174           23 :       emit_insn (gen_pop (target));
   16175           23 :       return target;
   16176              : 
   16177           21 :     case IX86_BUILTIN_WRITE_FLAGS:
   16178              : 
   16179           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16180           21 :       op0 = expand_normal (arg0);
   16181           21 :       if (!general_no_elim_operand (op0, word_mode))
   16182            0 :         op0 = copy_to_mode_reg (word_mode, op0);
   16183              : 
   16184           21 :       emit_insn (gen_push (op0));
   16185           21 :       emit_insn (gen_popfl ());
   16186           21 :       return 0;
   16187              : 
   16188           22 :     case IX86_BUILTIN_KTESTC8:
   16189           22 :       icode = CODE_FOR_ktestqi;
   16190           22 :       mode3 = CCCmode;
   16191           22 :       goto kortest;
   16192              : 
   16193           22 :     case IX86_BUILTIN_KTESTZ8:
   16194           22 :       icode = CODE_FOR_ktestqi;
   16195           22 :       mode3 = CCZmode;
   16196           22 :       goto kortest;
   16197              : 
   16198           22 :     case IX86_BUILTIN_KTESTC16:
   16199           22 :       icode = CODE_FOR_ktesthi;
   16200           22 :       mode3 = CCCmode;
   16201           22 :       goto kortest;
   16202              : 
   16203           22 :     case IX86_BUILTIN_KTESTZ16:
   16204           22 :       icode = CODE_FOR_ktesthi;
   16205           22 :       mode3 = CCZmode;
   16206           22 :       goto kortest;
   16207              : 
   16208           22 :     case IX86_BUILTIN_KTESTC32:
   16209           22 :       icode = CODE_FOR_ktestsi;
   16210           22 :       mode3 = CCCmode;
   16211           22 :       goto kortest;
   16212              : 
   16213           22 :     case IX86_BUILTIN_KTESTZ32:
   16214           22 :       icode = CODE_FOR_ktestsi;
   16215           22 :       mode3 = CCZmode;
   16216           22 :       goto kortest;
   16217              : 
   16218           22 :     case IX86_BUILTIN_KTESTC64:
   16219           22 :       icode = CODE_FOR_ktestdi;
   16220           22 :       mode3 = CCCmode;
   16221           22 :       goto kortest;
   16222              : 
   16223           22 :     case IX86_BUILTIN_KTESTZ64:
   16224           22 :       icode = CODE_FOR_ktestdi;
   16225           22 :       mode3 = CCZmode;
   16226           22 :       goto kortest;
   16227              : 
   16228           22 :     case IX86_BUILTIN_KORTESTC8:
   16229           22 :       icode = CODE_FOR_kortestqi;
   16230           22 :       mode3 = CCCmode;
   16231           22 :       goto kortest;
   16232              : 
   16233           76 :     case IX86_BUILTIN_KORTESTZ8:
   16234           76 :       icode = CODE_FOR_kortestqi;
   16235           76 :       mode3 = CCZmode;
   16236           76 :       goto kortest;
   16237              : 
   16238           38 :     case IX86_BUILTIN_KORTESTC16:
   16239           38 :       icode = CODE_FOR_kortesthi;
   16240           38 :       mode3 = CCCmode;
   16241           38 :       goto kortest;
   16242              : 
   16243           91 :     case IX86_BUILTIN_KORTESTZ16:
   16244           91 :       icode = CODE_FOR_kortesthi;
   16245           91 :       mode3 = CCZmode;
   16246           91 :       goto kortest;
   16247              : 
   16248           22 :     case IX86_BUILTIN_KORTESTC32:
   16249           22 :       icode = CODE_FOR_kortestsi;
   16250           22 :       mode3 = CCCmode;
   16251           22 :       goto kortest;
   16252              : 
   16253           79 :     case IX86_BUILTIN_KORTESTZ32:
   16254           79 :       icode = CODE_FOR_kortestsi;
   16255           79 :       mode3 = CCZmode;
   16256           79 :       goto kortest;
   16257              : 
   16258           22 :     case IX86_BUILTIN_KORTESTC64:
   16259           22 :       icode = CODE_FOR_kortestdi;
   16260           22 :       mode3 = CCCmode;
   16261           22 :       goto kortest;
   16262              : 
   16263              :     case IX86_BUILTIN_KORTESTZ64:
   16264              :       icode = CODE_FOR_kortestdi;
   16265              :       mode3 = CCZmode;
   16266              : 
   16267          610 :     kortest:
   16268          610 :       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
   16269          610 :       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
   16270          610 :       op0 = expand_normal (arg0);
   16271          610 :       op1 = expand_normal (arg1);
   16272              : 
   16273          610 :       mode0 = insn_data[icode].operand[0].mode;
   16274          610 :       mode1 = insn_data[icode].operand[1].mode;
   16275              : 
   16276          610 :       if (GET_MODE (op0) != VOIDmode)
   16277          610 :         op0 = force_reg (GET_MODE (op0), op0);
   16278              : 
   16279          610 :       op0 = gen_lowpart (mode0, op0);
   16280              : 
   16281          610 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   16282            0 :         op0 = copy_to_mode_reg (mode0, op0);
   16283              : 
   16284          610 :       if (GET_MODE (op1) != VOIDmode)
   16285          609 :         op1 = force_reg (GET_MODE (op1), op1);
   16286              : 
   16287          610 :       op1 = gen_lowpart (mode1, op1);
   16288              : 
   16289          610 :       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   16290            1 :         op1 = copy_to_mode_reg (mode1, op1);
   16291              : 
   16292          610 :       target = gen_reg_rtx (QImode);
   16293              : 
    16294              :       /* Emit the ktest or kortest insn selected by ICODE.  */
   16295          610 :       emit_insn (GEN_FCN (icode) (op0, op1));
   16296              :       /* And use setcc to return result from flags.  */
   16297          610 :       ix86_expand_setcc (target, EQ,
   16298              :                          gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
   16299          610 :       return target;
   16300              : 
   16301           24 :     case IX86_BUILTIN_GATHERSIV2DF:
   16302           24 :       icode = CODE_FOR_avx2_gathersiv2df;
   16303           24 :       goto gather_gen;
   16304           18 :     case IX86_BUILTIN_GATHERSIV4DF:
   16305           18 :       icode = CODE_FOR_avx2_gathersiv4df;
   16306           18 :       goto gather_gen;
   16307           21 :     case IX86_BUILTIN_GATHERDIV2DF:
   16308           21 :       icode = CODE_FOR_avx2_gatherdiv2df;
   16309           21 :       goto gather_gen;
   16310           32 :     case IX86_BUILTIN_GATHERDIV4DF:
   16311           32 :       icode = CODE_FOR_avx2_gatherdiv4df;
   16312           32 :       goto gather_gen;
   16313           30 :     case IX86_BUILTIN_GATHERSIV4SF:
   16314           30 :       icode = CODE_FOR_avx2_gathersiv4sf;
   16315           30 :       goto gather_gen;
   16316           37 :     case IX86_BUILTIN_GATHERSIV8SF:
   16317           37 :       icode = CODE_FOR_avx2_gathersiv8sf;
   16318           37 :       goto gather_gen;
   16319           24 :     case IX86_BUILTIN_GATHERDIV4SF:
   16320           24 :       icode = CODE_FOR_avx2_gatherdiv4sf;
   16321           24 :       goto gather_gen;
   16322           18 :     case IX86_BUILTIN_GATHERDIV8SF:
   16323           18 :       icode = CODE_FOR_avx2_gatherdiv8sf;
   16324           18 :       goto gather_gen;
   16325           18 :     case IX86_BUILTIN_GATHERSIV2DI:
   16326           18 :       icode = CODE_FOR_avx2_gathersiv2di;
   16327           18 :       goto gather_gen;
   16328           18 :     case IX86_BUILTIN_GATHERSIV4DI:
   16329           18 :       icode = CODE_FOR_avx2_gathersiv4di;
   16330           18 :       goto gather_gen;
   16331           27 :     case IX86_BUILTIN_GATHERDIV2DI:
   16332           27 :       icode = CODE_FOR_avx2_gatherdiv2di;
   16333           27 :       goto gather_gen;
   16334           29 :     case IX86_BUILTIN_GATHERDIV4DI:
   16335           29 :       icode = CODE_FOR_avx2_gatherdiv4di;
   16336           29 :       goto gather_gen;
   16337           20 :     case IX86_BUILTIN_GATHERSIV4SI:
   16338           20 :       icode = CODE_FOR_avx2_gathersiv4si;
   16339           20 :       goto gather_gen;
   16340           22 :     case IX86_BUILTIN_GATHERSIV8SI:
   16341           22 :       icode = CODE_FOR_avx2_gathersiv8si;
   16342           22 :       goto gather_gen;
   16343           28 :     case IX86_BUILTIN_GATHERDIV4SI:
   16344           28 :       icode = CODE_FOR_avx2_gatherdiv4si;
   16345           28 :       goto gather_gen;
   16346           18 :     case IX86_BUILTIN_GATHERDIV8SI:
   16347           18 :       icode = CODE_FOR_avx2_gatherdiv8si;
   16348           18 :       goto gather_gen;
   16349           20 :     case IX86_BUILTIN_GATHERALTSIV4DF:
   16350           20 :       icode = CODE_FOR_avx2_gathersiv4df;
   16351           20 :       goto gather_gen;
   16352           16 :     case IX86_BUILTIN_GATHERALTDIV8SF:
   16353           16 :       icode = CODE_FOR_avx2_gatherdiv8sf;
   16354           16 :       goto gather_gen;
   16355            4 :     case IX86_BUILTIN_GATHERALTSIV4DI:
   16356            4 :       icode = CODE_FOR_avx2_gathersiv4di;
   16357            4 :       goto gather_gen;
   16358           12 :     case IX86_BUILTIN_GATHERALTDIV8SI:
   16359           12 :       icode = CODE_FOR_avx2_gatherdiv8si;
   16360           12 :       goto gather_gen;
   16361           36 :     case IX86_BUILTIN_GATHER3SIV16SF:
   16362           36 :       icode = CODE_FOR_avx512f_gathersiv16sf;
   16363           36 :       goto gather_gen;
   16364           24 :     case IX86_BUILTIN_GATHER3SIV8DF:
   16365           24 :       icode = CODE_FOR_avx512f_gathersiv8df;
   16366           24 :       goto gather_gen;
   16367           24 :     case IX86_BUILTIN_GATHER3DIV16SF:
   16368           24 :       icode = CODE_FOR_avx512f_gatherdiv16sf;
   16369           24 :       goto gather_gen;
   16370           37 :     case IX86_BUILTIN_GATHER3DIV8DF:
   16371           37 :       icode = CODE_FOR_avx512f_gatherdiv8df;
   16372           37 :       goto gather_gen;
   16373           30 :     case IX86_BUILTIN_GATHER3SIV16SI:
   16374           30 :       icode = CODE_FOR_avx512f_gathersiv16si;
   16375           30 :       goto gather_gen;
   16376           24 :     case IX86_BUILTIN_GATHER3SIV8DI:
   16377           24 :       icode = CODE_FOR_avx512f_gathersiv8di;
   16378           24 :       goto gather_gen;
   16379           24 :     case IX86_BUILTIN_GATHER3DIV16SI:
   16380           24 :       icode = CODE_FOR_avx512f_gatherdiv16si;
   16381           24 :       goto gather_gen;
   16382           37 :     case IX86_BUILTIN_GATHER3DIV8DI:
   16383           37 :       icode = CODE_FOR_avx512f_gatherdiv8di;
   16384           37 :       goto gather_gen;
   16385           16 :     case IX86_BUILTIN_GATHER3ALTSIV8DF:
   16386           16 :       icode = CODE_FOR_avx512f_gathersiv8df;
   16387           16 :       goto gather_gen;
   16388           22 :     case IX86_BUILTIN_GATHER3ALTDIV16SF:
   16389           22 :       icode = CODE_FOR_avx512f_gatherdiv16sf;
   16390           22 :       goto gather_gen;
   16391           14 :     case IX86_BUILTIN_GATHER3ALTSIV8DI:
   16392           14 :       icode = CODE_FOR_avx512f_gathersiv8di;
   16393           14 :       goto gather_gen;
   16394           18 :     case IX86_BUILTIN_GATHER3ALTDIV16SI:
   16395           18 :       icode = CODE_FOR_avx512f_gatherdiv16si;
   16396           18 :       goto gather_gen;
   16397           18 :     case IX86_BUILTIN_GATHER3SIV2DF:
   16398           18 :       icode = CODE_FOR_avx512vl_gathersiv2df;
   16399           18 :       goto gather_gen;
   16400           10 :     case IX86_BUILTIN_GATHER3SIV4DF:
   16401           10 :       icode = CODE_FOR_avx512vl_gathersiv4df;
   16402           10 :       goto gather_gen;
   16403           15 :     case IX86_BUILTIN_GATHER3DIV2DF:
   16404           15 :       icode = CODE_FOR_avx512vl_gatherdiv2df;
   16405           15 :       goto gather_gen;
   16406           16 :     case IX86_BUILTIN_GATHER3DIV4DF:
   16407           16 :       icode = CODE_FOR_avx512vl_gatherdiv4df;
   16408           16 :       goto gather_gen;
   16409           14 :     case IX86_BUILTIN_GATHER3SIV4SF:
   16410           14 :       icode = CODE_FOR_avx512vl_gathersiv4sf;
   16411           14 :       goto gather_gen;
   16412           12 :     case IX86_BUILTIN_GATHER3SIV8SF:
   16413           12 :       icode = CODE_FOR_avx512vl_gathersiv8sf;
   16414           12 :       goto gather_gen;
   16415           22 :     case IX86_BUILTIN_GATHER3DIV4SF:
   16416           22 :       icode = CODE_FOR_avx512vl_gatherdiv4sf;
   16417           22 :       goto gather_gen;
   16418           10 :     case IX86_BUILTIN_GATHER3DIV8SF:
   16419           10 :       icode = CODE_FOR_avx512vl_gatherdiv8sf;
   16420           10 :       goto gather_gen;
   16421           20 :     case IX86_BUILTIN_GATHER3SIV2DI:
   16422           20 :       icode = CODE_FOR_avx512vl_gathersiv2di;
   16423           20 :       goto gather_gen;
   16424           10 :     case IX86_BUILTIN_GATHER3SIV4DI:
   16425           10 :       icode = CODE_FOR_avx512vl_gathersiv4di;
   16426           10 :       goto gather_gen;
   16427           14 :     case IX86_BUILTIN_GATHER3DIV2DI:
   16428           14 :       icode = CODE_FOR_avx512vl_gatherdiv2di;
   16429           14 :       goto gather_gen;
   16430           13 :     case IX86_BUILTIN_GATHER3DIV4DI:
   16431           13 :       icode = CODE_FOR_avx512vl_gatherdiv4di;
   16432           13 :       goto gather_gen;
   16433           14 :     case IX86_BUILTIN_GATHER3SIV4SI:
   16434           14 :       icode = CODE_FOR_avx512vl_gathersiv4si;
   16435           14 :       goto gather_gen;
   16436           12 :     case IX86_BUILTIN_GATHER3SIV8SI:
   16437           12 :       icode = CODE_FOR_avx512vl_gathersiv8si;
   16438           12 :       goto gather_gen;
   16439           24 :     case IX86_BUILTIN_GATHER3DIV4SI:
   16440           24 :       icode = CODE_FOR_avx512vl_gatherdiv4si;
   16441           24 :       goto gather_gen;
   16442           10 :     case IX86_BUILTIN_GATHER3DIV8SI:
   16443           10 :       icode = CODE_FOR_avx512vl_gatherdiv8si;
   16444           10 :       goto gather_gen;
   16445            4 :     case IX86_BUILTIN_GATHER3ALTSIV4DF:
   16446            4 :       icode = CODE_FOR_avx512vl_gathersiv4df;
   16447            4 :       goto gather_gen;
   16448            8 :     case IX86_BUILTIN_GATHER3ALTDIV8SF:
   16449            8 :       icode = CODE_FOR_avx512vl_gatherdiv8sf;
   16450            8 :       goto gather_gen;
   16451            6 :     case IX86_BUILTIN_GATHER3ALTSIV4DI:
   16452            6 :       icode = CODE_FOR_avx512vl_gathersiv4di;
   16453            6 :       goto gather_gen;
   16454           10 :     case IX86_BUILTIN_GATHER3ALTDIV8SI:
   16455           10 :       icode = CODE_FOR_avx512vl_gatherdiv8si;
   16456           10 :       goto gather_gen;
   16457           40 :     case IX86_BUILTIN_SCATTERSIV16SF:
   16458           40 :       icode = CODE_FOR_avx512f_scattersiv16sf;
   16459           40 :       goto scatter_gen;
   16460           27 :     case IX86_BUILTIN_SCATTERSIV8DF:
   16461           27 :       icode = CODE_FOR_avx512f_scattersiv8df;
   16462           27 :       goto scatter_gen;
   16463           24 :     case IX86_BUILTIN_SCATTERDIV16SF:
   16464           24 :       icode = CODE_FOR_avx512f_scatterdiv16sf;
   16465           24 :       goto scatter_gen;
   16466           33 :     case IX86_BUILTIN_SCATTERDIV8DF:
   16467           33 :       icode = CODE_FOR_avx512f_scatterdiv8df;
   16468           33 :       goto scatter_gen;
   16469           30 :     case IX86_BUILTIN_SCATTERSIV16SI:
   16470           30 :       icode = CODE_FOR_avx512f_scattersiv16si;
   16471           30 :       goto scatter_gen;
   16472           24 :     case IX86_BUILTIN_SCATTERSIV8DI:
   16473           24 :       icode = CODE_FOR_avx512f_scattersiv8di;
   16474           24 :       goto scatter_gen;
   16475           24 :     case IX86_BUILTIN_SCATTERDIV16SI:
   16476           24 :       icode = CODE_FOR_avx512f_scatterdiv16si;
   16477           24 :       goto scatter_gen;
   16478           29 :     case IX86_BUILTIN_SCATTERDIV8DI:
   16479           29 :       icode = CODE_FOR_avx512f_scatterdiv8di;
   16480           29 :       goto scatter_gen;
   16481           18 :     case IX86_BUILTIN_SCATTERSIV8SF:
   16482           18 :       icode = CODE_FOR_avx512vl_scattersiv8sf;
   16483           18 :       goto scatter_gen;
   16484           20 :     case IX86_BUILTIN_SCATTERSIV4SF:
   16485           20 :       icode = CODE_FOR_avx512vl_scattersiv4sf;
   16486           20 :       goto scatter_gen;
   16487           16 :     case IX86_BUILTIN_SCATTERSIV4DF:
   16488           16 :       icode = CODE_FOR_avx512vl_scattersiv4df;
   16489           16 :       goto scatter_gen;
   16490           16 :     case IX86_BUILTIN_SCATTERSIV2DF:
   16491           16 :       icode = CODE_FOR_avx512vl_scattersiv2df;
   16492           16 :       goto scatter_gen;
   16493           16 :     case IX86_BUILTIN_SCATTERDIV8SF:
   16494           16 :       icode = CODE_FOR_avx512vl_scatterdiv8sf;
   16495           16 :       goto scatter_gen;
   16496           16 :     case IX86_BUILTIN_SCATTERDIV4SF:
   16497           16 :       icode = CODE_FOR_avx512vl_scatterdiv4sf;
   16498           16 :       goto scatter_gen;
   16499           18 :     case IX86_BUILTIN_SCATTERDIV4DF:
   16500           18 :       icode = CODE_FOR_avx512vl_scatterdiv4df;
   16501           18 :       goto scatter_gen;
   16502           18 :     case IX86_BUILTIN_SCATTERDIV2DF:
   16503           18 :       icode = CODE_FOR_avx512vl_scatterdiv2df;
   16504           18 :       goto scatter_gen;
   16505           22 :     case IX86_BUILTIN_SCATTERSIV8SI:
   16506           22 :       icode = CODE_FOR_avx512vl_scattersiv8si;
   16507           22 :       goto scatter_gen;
   16508           24 :     case IX86_BUILTIN_SCATTERSIV4SI:
   16509           24 :       icode = CODE_FOR_avx512vl_scattersiv4si;
   16510           24 :       goto scatter_gen;
   16511           16 :     case IX86_BUILTIN_SCATTERSIV4DI:
   16512           16 :       icode = CODE_FOR_avx512vl_scattersiv4di;
   16513           16 :       goto scatter_gen;
   16514           16 :     case IX86_BUILTIN_SCATTERSIV2DI:
   16515           16 :       icode = CODE_FOR_avx512vl_scattersiv2di;
   16516           16 :       goto scatter_gen;
   16517           16 :     case IX86_BUILTIN_SCATTERDIV8SI:
   16518           16 :       icode = CODE_FOR_avx512vl_scatterdiv8si;
   16519           16 :       goto scatter_gen;
   16520           16 :     case IX86_BUILTIN_SCATTERDIV4SI:
   16521           16 :       icode = CODE_FOR_avx512vl_scatterdiv4si;
   16522           16 :       goto scatter_gen;
   16523           18 :     case IX86_BUILTIN_SCATTERDIV4DI:
   16524           18 :       icode = CODE_FOR_avx512vl_scatterdiv4di;
   16525           18 :       goto scatter_gen;
   16526           18 :     case IX86_BUILTIN_SCATTERDIV2DI:
   16527           18 :       icode = CODE_FOR_avx512vl_scatterdiv2di;
   16528           18 :       goto scatter_gen;
   16529           16 :     case IX86_BUILTIN_SCATTERALTSIV8DF:
   16530           16 :       icode = CODE_FOR_avx512f_scattersiv8df;
   16531           16 :       goto scatter_gen;
   16532           12 :     case IX86_BUILTIN_SCATTERALTDIV16SF:
   16533           12 :       icode = CODE_FOR_avx512f_scatterdiv16sf;
   16534           12 :       goto scatter_gen;
   16535            8 :     case IX86_BUILTIN_SCATTERALTSIV8DI:
   16536            8 :       icode = CODE_FOR_avx512f_scattersiv8di;
   16537            8 :       goto scatter_gen;
   16538           24 :     case IX86_BUILTIN_SCATTERALTDIV16SI:
   16539           24 :       icode = CODE_FOR_avx512f_scatterdiv16si;
   16540           24 :       goto scatter_gen;
   16541            4 :     case IX86_BUILTIN_SCATTERALTSIV4DF:
   16542            4 :       icode = CODE_FOR_avx512vl_scattersiv4df;
   16543            4 :       goto scatter_gen;
   16544            4 :     case IX86_BUILTIN_SCATTERALTDIV8SF:
   16545            4 :       icode = CODE_FOR_avx512vl_scatterdiv8sf;
   16546            4 :       goto scatter_gen;
   16547            4 :     case IX86_BUILTIN_SCATTERALTSIV4DI:
   16548            4 :       icode = CODE_FOR_avx512vl_scattersiv4di;
   16549            4 :       goto scatter_gen;
   16550            4 :     case IX86_BUILTIN_SCATTERALTDIV8SI:
   16551            4 :       icode = CODE_FOR_avx512vl_scatterdiv8si;
   16552            4 :       goto scatter_gen;
   16553            8 :     case IX86_BUILTIN_SCATTERALTSIV2DF:
   16554            8 :       icode = CODE_FOR_avx512vl_scattersiv2df;
   16555            8 :       goto scatter_gen;
   16556            8 :     case IX86_BUILTIN_SCATTERALTDIV4SF:
   16557            8 :       icode = CODE_FOR_avx512vl_scatterdiv4sf;
   16558            8 :       goto scatter_gen;
   16559            8 :     case IX86_BUILTIN_SCATTERALTSIV2DI:
   16560            8 :       icode = CODE_FOR_avx512vl_scattersiv2di;
   16561            8 :       goto scatter_gen;
   16562            8 :     case IX86_BUILTIN_SCATTERALTDIV4SI:
   16563            8 :       icode = CODE_FOR_avx512vl_scatterdiv4si;
   16564            8 :       goto scatter_gen;
   16565              : 
   16566         1004 :     gather_gen:
   16567         1004 :       rtx half;
   16568         1004 :       rtx (*gen) (rtx, rtx);
   16569              : 
   16570         1004 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16571         1004 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16572         1004 :       arg2 = CALL_EXPR_ARG (exp, 2);
   16573         1004 :       arg3 = CALL_EXPR_ARG (exp, 3);
   16574         1004 :       arg4 = CALL_EXPR_ARG (exp, 4);
   16575         1004 :       op0 = expand_normal (arg0);
   16576         1004 :       op1 = expand_normal (arg1);
   16577         1004 :       op2 = expand_normal (arg2);
   16578         1004 :       op3 = ix86_expand_unsigned_small_int_cst_argument (arg3);
   16579         1004 :       op4 = expand_normal (arg4);
   16580              :       /* Note the arg order is different from the operand order.  */
   16581         1004 :       mode0 = insn_data[icode].operand[1].mode;
   16582         1004 :       mode2 = insn_data[icode].operand[3].mode;
   16583         1004 :       mode3 = insn_data[icode].operand[4].mode;
   16584         1004 :       mode4 = insn_data[icode].operand[5].mode;
   16585              : 
   16586         1004 :       if (target == NULL_RTX
   16587         1004 :           || GET_MODE (target) != insn_data[icode].operand[0].mode
   16588         1904 :           || !insn_data[icode].operand[0].predicate (target,
   16589              :                                                      GET_MODE (target)))
   16590          105 :         subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
   16591              :       else
   16592              :         subtarget = target;
   16593              : 
   16594         1004 :       switch (fcode)
   16595              :         {
   16596           30 :         case IX86_BUILTIN_GATHER3ALTSIV8DF:
   16597           30 :         case IX86_BUILTIN_GATHER3ALTSIV8DI:
   16598           30 :           half = gen_reg_rtx (V8SImode);
   16599           30 :           if (!nonimmediate_operand (op2, V16SImode))
   16600            0 :             op2 = copy_to_mode_reg (V16SImode, op2);
   16601           30 :           emit_insn (gen_vec_extract_lo_v16si (half, op2));
   16602           30 :           op2 = half;
   16603           30 :           break;
   16604           34 :         case IX86_BUILTIN_GATHER3ALTSIV4DF:
   16605           34 :         case IX86_BUILTIN_GATHER3ALTSIV4DI:
   16606           34 :         case IX86_BUILTIN_GATHERALTSIV4DF:
   16607           34 :         case IX86_BUILTIN_GATHERALTSIV4DI:
   16608           34 :           half = gen_reg_rtx (V4SImode);
   16609           34 :           if (!nonimmediate_operand (op2, V8SImode))
   16610            0 :             op2 = copy_to_mode_reg (V8SImode, op2);
   16611           34 :           emit_insn (gen_vec_extract_lo_v8si (half, op2));
   16612           34 :           op2 = half;
   16613           34 :           break;
   16614           40 :         case IX86_BUILTIN_GATHER3ALTDIV16SF:
   16615           40 :         case IX86_BUILTIN_GATHER3ALTDIV16SI:
   16616           40 :           half = gen_reg_rtx (mode0);
   16617           40 :           if (mode0 == V8SFmode)
   16618              :             gen = gen_vec_extract_lo_v16sf;
   16619              :           else
   16620           18 :             gen = gen_vec_extract_lo_v16si;
   16621           40 :           if (!nonimmediate_operand (op0, GET_MODE (op0)))
   16622           40 :             op0 = copy_to_mode_reg (GET_MODE (op0), op0);
   16623           40 :           emit_insn (gen (half, op0));
   16624           40 :           op0 = half;
   16625           40 :           op3 = lowpart_subreg (QImode, op3, HImode);
   16626           40 :           break;
   16627           46 :         case IX86_BUILTIN_GATHER3ALTDIV8SF:
   16628           46 :         case IX86_BUILTIN_GATHER3ALTDIV8SI:
   16629           46 :         case IX86_BUILTIN_GATHERALTDIV8SF:
   16630           46 :         case IX86_BUILTIN_GATHERALTDIV8SI:
   16631           46 :           half = gen_reg_rtx (mode0);
   16632           46 :           if (mode0 == V4SFmode)
   16633              :             gen = gen_vec_extract_lo_v8sf;
   16634              :           else
   16635           22 :             gen = gen_vec_extract_lo_v8si;
   16636           46 :           if (!nonimmediate_operand (op0, GET_MODE (op0)))
   16637           46 :             op0 = copy_to_mode_reg (GET_MODE (op0), op0);
   16638           46 :           emit_insn (gen (half, op0));
   16639           46 :           op0 = half;
   16640           46 :           if (VECTOR_MODE_P (GET_MODE (op3)))
   16641              :             {
   16642           28 :               half = gen_reg_rtx (mode0);
   16643           28 :               if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16644           12 :                 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16645           28 :               emit_insn (gen (half, op3));
   16646           28 :               op3 = half;
   16647              :             }
   16648              :           break;
   16649              :         default:
   16650              :           break;
   16651              :         }
   16652              : 
   16653              :       /* Force memory operand only with base register here.  But we
   16654              :          don't want to do it on memory operand for other builtin
   16655              :          functions.  */
   16656         1004 :       op1 = ix86_zero_extend_to_Pmode (op1);
   16657              : 
   16658         1004 :       if (!insn_data[icode].operand[1].predicate (op0, mode0))
   16659          403 :         op0 = copy_to_mode_reg (mode0, op0);
   16660         1009 :       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
   16661            0 :         op1 = copy_to_mode_reg (Pmode, op1);
   16662         1004 :       if (!insn_data[icode].operand[3].predicate (op2, mode2))
   16663          221 :         op2 = copy_to_mode_reg (mode2, op2);
   16664              : 
   16665         1004 :       op3 = fixup_modeless_constant (op3, mode3);
   16666              : 
   16667         1004 :       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
   16668              :         {
   16669         1004 :           if (!insn_data[icode].operand[4].predicate (op3, mode3))
   16670          356 :             op3 = copy_to_mode_reg (mode3, op3);
   16671              :         }
   16672              :       else
   16673              :         {
   16674            0 :           op3 = copy_to_reg (op3);
   16675            0 :           op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
   16676              :         }
   16677         1004 :       if (!insn_data[icode].operand[5].predicate (op4, mode4))
   16678              :         {
   16679            0 :           error ("the last argument must be scale 1, 2, 4, 8");
   16680            0 :           return const0_rtx;
   16681              :         }
   16682              : 
   16683              :       /* Optimize.  If mask is known to have all high bits set,
   16684              :          replace op0 with pc_rtx to signal that the instruction
   16685              :          overwrites the whole destination and doesn't use its
   16686              :          previous contents.  */
   16687         1004 :       if (optimize)
   16688              :         {
   16689          914 :           if (TREE_CODE (arg3) == INTEGER_CST)
   16690              :             {
   16691          209 :               if (integer_all_onesp (arg3))
   16692          201 :                 op0 = pc_rtx;
   16693              :             }
   16694          705 :           else if (TREE_CODE (arg3) == VECTOR_CST)
   16695              :             {
   16696              :               unsigned int negative = 0;
   16697          755 :               for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
   16698              :                 {
   16699          620 :                   tree cst = VECTOR_CST_ELT (arg3, i);
   16700          620 :                   if (TREE_CODE (cst) == INTEGER_CST
   16701          620 :                       && tree_int_cst_sign_bit (cst))
   16702          286 :                     negative++;
   16703          334 :                   else if (TREE_CODE (cst) == REAL_CST
   16704          334 :                            && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
   16705          306 :                     negative++;
   16706              :                 }
   16707          135 :               if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
   16708          121 :                 op0 = pc_rtx;
   16709              :             }
   16710          570 :           else if (TREE_CODE (arg3) == SSA_NAME
   16711          570 :                    && VECTOR_TYPE_P (TREE_TYPE (arg3)))
   16712              :             {
   16713              :               /* Recognize also when mask is like:
   16714              :                  __v2df src = _mm_setzero_pd ();
   16715              :                  __v2df mask = _mm_cmpeq_pd (src, src);
   16716              :                  or
   16717              :                  __v8sf src = _mm256_setzero_ps ();
   16718              :                  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
   16719              :                  as that is a cheaper way to load all ones into
   16720              :                  a register than having to load a constant from
   16721              :                  memory.  */
   16722          259 :               gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
   16723          259 :               if (is_gimple_call (def_stmt))
   16724              :                 {
   16725           76 :                   tree fndecl = gimple_call_fndecl (def_stmt);
   16726           76 :                   if (fndecl
   16727           76 :                       && fndecl_built_in_p (fndecl, BUILT_IN_MD))
   16728           67 :                     switch (DECL_MD_FUNCTION_CODE (fndecl))
   16729              :                       {
   16730           24 :                       case IX86_BUILTIN_CMPPD:
   16731           24 :                       case IX86_BUILTIN_CMPPS:
   16732           24 :                       case IX86_BUILTIN_CMPPD256:
   16733           24 :                       case IX86_BUILTIN_CMPPS256:
   16734           24 :                         if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
   16735              :                           break;
   16736              :                         /* FALLTHRU */
   16737           49 :                       case IX86_BUILTIN_CMPEQPD:
   16738           49 :                       case IX86_BUILTIN_CMPEQPS:
   16739           49 :                         if (initializer_zerop (gimple_call_arg (def_stmt, 0))
   16740           49 :                             && initializer_zerop (gimple_call_arg (def_stmt,
   16741              :                                                                    1)))
   16742           49 :                           op0 = pc_rtx;
   16743              :                         break;
   16744              :                       default:
   16745              :                         break;
   16746              :                       }
   16747              :                 }
   16748              :             }
   16749              :         }
   16750              : 
   16751         1004 :       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
   16752         1004 :       if (! pat)
   16753            0 :         return const0_rtx;
   16754         1004 :       emit_insn (pat);
   16755              : 
   16756         1004 :       switch (fcode)
   16757              :         {
   16758           24 :         case IX86_BUILTIN_GATHER3DIV16SF:
   16759           24 :           if (target == NULL_RTX)
   16760            0 :             target = gen_reg_rtx (V8SFmode);
   16761           24 :           emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
   16762           24 :           break;
   16763           24 :         case IX86_BUILTIN_GATHER3DIV16SI:
   16764           24 :           if (target == NULL_RTX)
   16765            0 :             target = gen_reg_rtx (V8SImode);
   16766           24 :           emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
   16767           24 :           break;
   16768           28 :         case IX86_BUILTIN_GATHER3DIV8SF:
   16769           28 :         case IX86_BUILTIN_GATHERDIV8SF:
   16770           28 :           if (target == NULL_RTX)
   16771            0 :             target = gen_reg_rtx (V4SFmode);
   16772           28 :           emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
   16773           28 :           break;
   16774           28 :         case IX86_BUILTIN_GATHER3DIV8SI:
   16775           28 :         case IX86_BUILTIN_GATHERDIV8SI:
   16776           28 :           if (target == NULL_RTX)
   16777            0 :             target = gen_reg_rtx (V4SImode);
   16778           28 :           emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
   16779           28 :           break;
   16780              :         default:
   16781              :           target = subtarget;
   16782              :           break;
   16783              :         }
   16784              :       return target;
   16785              : 
   16786          623 :     scatter_gen:
   16787          623 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16788          623 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16789          623 :       arg2 = CALL_EXPR_ARG (exp, 2);
   16790          623 :       arg3 = CALL_EXPR_ARG (exp, 3);
   16791          623 :       arg4 = CALL_EXPR_ARG (exp, 4);
   16792          623 :       op0 = expand_normal (arg0);
   16793          623 :       op1 = ix86_expand_unsigned_small_int_cst_argument (arg1);
   16794          623 :       op2 = expand_normal (arg2);
   16795          623 :       op3 = expand_normal (arg3);
   16796          623 :       op4 = expand_normal (arg4);
   16797          623 :       mode1 = insn_data[icode].operand[1].mode;
   16798          623 :       mode2 = insn_data[icode].operand[2].mode;
   16799          623 :       mode3 = insn_data[icode].operand[3].mode;
   16800          623 :       mode4 = insn_data[icode].operand[4].mode;
   16801              : 
   16802              :       /* Scatter instruction stores operand op3 to memory with
   16803              :          indices from op2 and scale from op4 under writemask op1.
   16804              :          If index operand op2 has more elements than source operand
   16805              :          op3, only its low half needs to be used.  And vice versa.  */
   16806          623 :       switch (fcode)
   16807              :         {
   16808           24 :         case IX86_BUILTIN_SCATTERALTSIV8DF:
   16809           24 :         case IX86_BUILTIN_SCATTERALTSIV8DI:
   16810           24 :           half = gen_reg_rtx (V8SImode);
   16811           24 :           if (!nonimmediate_operand (op2, V16SImode))
   16812            0 :             op2 = copy_to_mode_reg (V16SImode, op2);
   16813           24 :           emit_insn (gen_vec_extract_lo_v16si (half, op2));
   16814           24 :           op2 = half;
   16815           24 :           break;
   16816           36 :         case IX86_BUILTIN_SCATTERALTDIV16SF:
   16817           36 :         case IX86_BUILTIN_SCATTERALTDIV16SI:
   16818           36 :           half = gen_reg_rtx (mode3);
   16819           36 :           if (mode3 == V8SFmode)
   16820              :             gen = gen_vec_extract_lo_v16sf;
   16821              :           else
   16822           24 :             gen = gen_vec_extract_lo_v16si;
   16823           36 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16824            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16825           36 :           emit_insn (gen (half, op3));
   16826           36 :           op3 = half;
   16827           36 :           break;
   16828            8 :         case IX86_BUILTIN_SCATTERALTSIV4DF:
   16829            8 :         case IX86_BUILTIN_SCATTERALTSIV4DI:
   16830            8 :           half = gen_reg_rtx (V4SImode);
   16831            8 :           if (!nonimmediate_operand (op2, V8SImode))
   16832            0 :             op2 = copy_to_mode_reg (V8SImode, op2);
   16833            8 :           emit_insn (gen_vec_extract_lo_v8si (half, op2));
   16834            8 :           op2 = half;
   16835            8 :           break;
   16836            8 :         case IX86_BUILTIN_SCATTERALTDIV8SF:
   16837            8 :         case IX86_BUILTIN_SCATTERALTDIV8SI:
   16838            8 :           half = gen_reg_rtx (mode3);
   16839            8 :           if (mode3 == V4SFmode)
   16840              :             gen = gen_vec_extract_lo_v8sf;
   16841              :           else
   16842            4 :             gen = gen_vec_extract_lo_v8si;
   16843            8 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16844            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16845            8 :           emit_insn (gen (half, op3));
   16846            8 :           op3 = half;
   16847            8 :           break;
   16848           16 :         case IX86_BUILTIN_SCATTERALTSIV2DF:
   16849           16 :         case IX86_BUILTIN_SCATTERALTSIV2DI:
   16850           16 :           if (!nonimmediate_operand (op2, V4SImode))
   16851            0 :             op2 = copy_to_mode_reg (V4SImode, op2);
   16852              :           break;
   16853           16 :         case IX86_BUILTIN_SCATTERALTDIV4SF:
   16854           16 :         case IX86_BUILTIN_SCATTERALTDIV4SI:
   16855           16 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16856            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16857              :           break;
   16858              :         default:
   16859              :           break;
   16860              :         }
   16861              : 
   16862              :       /* Force memory operand only with base register here.  But we
   16863              :          don't want to do it on memory operand for other builtin
   16864              :          functions.  */
   16865          633 :       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
   16866              : 
   16867          628 :       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   16868            0 :         op0 = copy_to_mode_reg (Pmode, op0);
   16869              : 
   16870          623 :       op1 = fixup_modeless_constant (op1, mode1);
   16871              : 
   16872          623 :       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
   16873              :         {
   16874          607 :           if (!insn_data[icode].operand[1].predicate (op1, mode1))
   16875          273 :             op1 = copy_to_mode_reg (mode1, op1);
   16876              :         }
   16877              :       else
   16878              :         {
   16879           16 :           op1 = copy_to_reg (op1);
   16880           16 :           op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
   16881              :         }
   16882              : 
   16883          623 :       if (!insn_data[icode].operand[2].predicate (op2, mode2))
   16884           57 :         op2 = copy_to_mode_reg (mode2, op2);
   16885              : 
   16886          623 :       if (!insn_data[icode].operand[3].predicate (op3, mode3))
   16887           82 :         op3 = copy_to_mode_reg (mode3, op3);
   16888              : 
   16889          623 :       if (!insn_data[icode].operand[4].predicate (op4, mode4))
   16890              :         {
   16891            0 :           error ("the last argument must be scale 1, 2, 4, 8");
   16892            0 :           return const0_rtx;
   16893              :         }
   16894              : 
   16895          623 :       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
   16896          623 :       if (! pat)
   16897            0 :         return const0_rtx;
   16898              : 
   16899          623 :       emit_insn (pat);
   16900          623 :       return 0;
   16901              : 
   16902           23 :     case IX86_BUILTIN_XABORT:
   16903           23 :       icode = CODE_FOR_xabort;
   16904           23 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16905           23 :       op0 = expand_normal (arg0);
   16906           23 :       mode0 = insn_data[icode].operand[0].mode;
   16907           23 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   16908              :         {
   16909            0 :           error ("the argument to %<xabort%> intrinsic must "
   16910              :                  "be an 8-bit immediate");
   16911            0 :           return const0_rtx;
   16912              :         }
   16913           23 :       emit_insn (gen_xabort (op0));
   16914           23 :       return 0;
   16915              : 
   16916           55 :     case IX86_BUILTIN_RDSSPD:
   16917           55 :     case IX86_BUILTIN_RDSSPQ:
   16918           55 :       mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
   16919              : 
   16920           55 :       if (target == 0
   16921           55 :           || !register_operand (target, mode))
   16922            0 :         target = gen_reg_rtx (mode);
   16923              : 
   16924           55 :       op0 = force_reg (mode, const0_rtx);
   16925              : 
   16926           55 :       emit_insn (gen_rdssp (mode, target, op0));
   16927           55 :       return target;
   16928              : 
   16929           55 :     case IX86_BUILTIN_INCSSPD:
   16930           55 :     case IX86_BUILTIN_INCSSPQ:
   16931           55 :       mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
   16932              : 
   16933           55 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16934           55 :       op0 = expand_normal (arg0);
   16935              : 
   16936           55 :       op0 = force_reg (mode, op0);
   16937              : 
   16938           55 :       emit_insn (gen_incssp (mode, op0));
   16939           55 :       return 0;
   16940              : 
   16941           20 :     case IX86_BUILTIN_HRESET:
   16942           20 :       icode = CODE_FOR_hreset;
   16943           20 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16944           20 :       op0 = expand_normal (arg0);
   16945           20 :       op0 = force_reg (SImode, op0);
   16946           20 :       emit_insn (gen_hreset (op0));
   16947           20 :       return 0;
   16948              : 
   16949           38 :     case IX86_BUILTIN_RSTORSSP:
   16950           38 :     case IX86_BUILTIN_CLRSSBSY:
   16951           38 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16952           38 :       op0 = expand_normal (arg0);
   16953           19 :       icode = (fcode == IX86_BUILTIN_RSTORSSP
   16954           38 :                ? CODE_FOR_rstorssp
   16955              :                : CODE_FOR_clrssbsy);
   16956              : 
   16957           38 :       if (!address_operand (op0, VOIDmode))
   16958              :         {
   16959           18 :           op0 = convert_memory_address (Pmode, op0);
   16960           18 :           op0 = copy_addr_to_reg (op0);
   16961              :         }
   16962           38 :       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
   16963           38 :       return 0;
   16964              : 
   16965           80 :     case IX86_BUILTIN_WRSSD:
   16966           80 :     case IX86_BUILTIN_WRSSQ:
   16967           80 :     case IX86_BUILTIN_WRUSSD:
   16968           80 :     case IX86_BUILTIN_WRUSSQ:
   16969           80 :       mode = ((fcode == IX86_BUILTIN_WRSSD
   16970           80 :                || fcode == IX86_BUILTIN_WRUSSD)
   16971           80 :               ? SImode : DImode);
   16972              : 
   16973           80 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16974           80 :       op0 = expand_normal (arg0);
   16975           80 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16976           80 :       op1 = expand_normal (arg1);
   16977              : 
   16978           80 :       op0 = force_reg (mode, op0);
   16979              : 
   16980           80 :       if (!address_operand (op1, VOIDmode))
   16981              :         {
   16982           36 :           op1 = convert_memory_address (Pmode, op1);
   16983           36 :           op1 = copy_addr_to_reg (op1);
   16984              :         }
   16985           80 :       op1 = gen_rtx_MEM (mode, op1);
   16986              : 
   16987           80 :       icode = ((fcode == IX86_BUILTIN_WRSSD
   16988           80 :                 || fcode == IX86_BUILTIN_WRSSQ)
   16989           80 :                ? code_for_wrss (mode)
   16990           40 :                : code_for_wruss (mode));
   16991           80 :       emit_insn (GEN_FCN (icode) (op0, op1));
   16992              : 
   16993           80 :       return 0;
   16994              : 
   16995       116627 :     default:
   16996       116627 :       break;
   16997              :     }
   16998              : 
   16999       116627 :   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
   17000       116627 :       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
   17001              :     {
   17002        27053 :       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
   17003        27053 :       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
   17004        27053 :                                                target);
   17005              :     }
   17006              : 
   17007        89574 :   if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
   17008        89574 :       && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
   17009              :     {
   17010           93 :       i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
   17011           93 :       return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
   17012           93 :                                                target);
   17013              :     }
   17014              : 
   17015        89481 :   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
   17016        89481 :       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
   17017              :     {
   17018        71075 :       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
   17019              : 
   17020        71075 :       switch (fcode)
   17021              :         {
   17022            0 :           case IX86_BUILTIN_RDPID:
   17023            0 :             return ix86_expand_special_args_builtin (bdesc_args + i, exp,
   17024            0 :                                                      target);
   17025           74 :           case IX86_BUILTIN_VCOMISBF16EQ:
   17026           74 :           case IX86_BUILTIN_VCOMISBF16NE:
   17027           74 :           case IX86_BUILTIN_VCOMISBF16GT:
   17028           74 :           case IX86_BUILTIN_VCOMISBF16GE:
   17029           74 :           case IX86_BUILTIN_VCOMISBF16LT:
   17030           74 :           case IX86_BUILTIN_VCOMISBF16LE:
   17031           74 :             return ix86_expand_sse_comi (bdesc_args + i, exp, target, false);
   17032           15 :           case IX86_BUILTIN_FABSQ:
   17033           15 :           case IX86_BUILTIN_COPYSIGNQ:
   17034           15 :             if (!TARGET_SSE)
   17035              :               /* Emit a normal call if SSE isn't available.  */
   17036            0 :               return expand_call (exp, target, ignore);
   17037              :             /* FALLTHRU */
   17038        71001 :           default:
   17039        71001 :             return ix86_expand_args_builtin (bdesc_args + i, exp, target);
   17040              :           }
   17041              :     }
   17042              : 
   17043        18406 :   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
   17044        18406 :       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
   17045              :     {
   17046          473 :       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
   17047          473 :       return ix86_expand_sse_comi (bdesc_comi + i, exp, target, true);
   17048              :     }
   17049              : 
   17050        17933 :   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
   17051        17933 :       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
   17052              :     {
   17053        15589 :       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
   17054        15589 :       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
   17055              :     }
   17056              : 
   17057         2344 :   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
   17058         2344 :       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
   17059              :     {
   17060          216 :       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
   17061          216 :       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
   17062              :     }
   17063              : 
   17064         2128 :   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
   17065         2128 :       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
   17066              :     {
   17067          275 :       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
   17068          275 :       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
   17069              :     }
   17070              : 
   17071         1853 :   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
   17072         1853 :       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
   17073              :     {
   17074         1815 :       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
   17075         1815 :       const struct builtin_description *d = bdesc_multi_arg + i;
   17076         1815 :       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
   17077              :                                             (enum ix86_builtin_func_type)
   17078         1815 :                                             d->flag, d->comparison);
   17079              :     }
   17080              : 
   17081           38 :   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
   17082           38 :       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
   17083              :     {
   17084           38 :       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
   17085           38 :       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
   17086           38 :                                                target);
   17087              :     }
   17088              : 
   17089            0 :   gcc_unreachable ();
   17090              : }
   17091              : 
   17092              : /* See below where shifts are handled for explanation of this enum.  */
   17093              : enum ix86_vec_bcast_alg
   17094              : {
   17095              :   VEC_BCAST_PXOR,
   17096              :   VEC_BCAST_PCMPEQ,
   17097              :   VEC_BCAST_PABSB,
   17098              :   VEC_BCAST_PADDB,
   17099              :   VEC_BCAST_PSRLW,
   17100              :   VEC_BCAST_PSRLD,
   17101              :   VEC_BCAST_PSLLW,
   17102              :   VEC_BCAST_PSLLD
   17103              : };
   17104              : 
   17105              : struct ix86_vec_bcast_map_simode_t
   17106              : {
   17107              :   unsigned int key;
   17108              :   enum ix86_vec_bcast_alg alg;
   17109              :   unsigned int arg;
   17110              : };
   17111              : 
   17112              : /* This table must be kept sorted as values are looked-up using bsearch.  */
   17113              : static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = {
   17114              :   { 0x00000000, VEC_BCAST_PXOR,    0 },
   17115              :   { 0x00000001, VEC_BCAST_PSRLD,  31 },
   17116              :   { 0x00000003, VEC_BCAST_PSRLD,  30 },
   17117              :   { 0x00000007, VEC_BCAST_PSRLD,  29 },
   17118              :   { 0x0000000f, VEC_BCAST_PSRLD,  28 },
   17119              :   { 0x0000001f, VEC_BCAST_PSRLD,  27 },
   17120              :   { 0x0000003f, VEC_BCAST_PSRLD,  26 },
   17121              :   { 0x0000007f, VEC_BCAST_PSRLD,  25 },
   17122              :   { 0x000000ff, VEC_BCAST_PSRLD,  24 },
   17123              :   { 0x000001ff, VEC_BCAST_PSRLD,  23 },
   17124              :   { 0x000003ff, VEC_BCAST_PSRLD,  22 },
   17125              :   { 0x000007ff, VEC_BCAST_PSRLD,  21 },
   17126              :   { 0x00000fff, VEC_BCAST_PSRLD,  20 },
   17127              :   { 0x00001fff, VEC_BCAST_PSRLD,  19 },
   17128              :   { 0x00003fff, VEC_BCAST_PSRLD,  18 },
   17129              :   { 0x00007fff, VEC_BCAST_PSRLD,  17 },
   17130              :   { 0x0000ffff, VEC_BCAST_PSRLD,  16 },
   17131              :   { 0x00010001, VEC_BCAST_PSRLW,  15 },
   17132              :   { 0x0001ffff, VEC_BCAST_PSRLD,  15 },
   17133              :   { 0x00030003, VEC_BCAST_PSRLW,  14 },
   17134              :   { 0x0003ffff, VEC_BCAST_PSRLD,  14 },
   17135              :   { 0x00070007, VEC_BCAST_PSRLW,  13 },
   17136              :   { 0x0007ffff, VEC_BCAST_PSRLD,  13 },
   17137              :   { 0x000f000f, VEC_BCAST_PSRLW,  12 },
   17138              :   { 0x000fffff, VEC_BCAST_PSRLD,  12 },
   17139              :   { 0x001f001f, VEC_BCAST_PSRLW,  11 },
   17140              :   { 0x001fffff, VEC_BCAST_PSRLD,  11 },
   17141              :   { 0x003f003f, VEC_BCAST_PSRLW,  10 },
   17142              :   { 0x003fffff, VEC_BCAST_PSRLD,  10 },
   17143              :   { 0x007f007f, VEC_BCAST_PSRLW,   9 },
   17144              :   { 0x007fffff, VEC_BCAST_PSRLD,   9 },
   17145              :   { 0x00ff00ff, VEC_BCAST_PSRLW,   8 },
   17146              :   { 0x00ffffff, VEC_BCAST_PSRLD,   8 },
   17147              :   { 0x01010101, VEC_BCAST_PABSB,   0 },
   17148              :   { 0x01ff01ff, VEC_BCAST_PSRLW,   7 },
   17149              :   { 0x01ffffff, VEC_BCAST_PSRLD,   7 },
   17150              :   { 0x03ff03ff, VEC_BCAST_PSRLW,   6 },
   17151              :   { 0x03ffffff, VEC_BCAST_PSRLD,   6 },
   17152              :   { 0x07ff07ff, VEC_BCAST_PSRLW,   5 },
   17153              :   { 0x07ffffff, VEC_BCAST_PSRLD,   5 },
   17154              :   { 0x0fff0fff, VEC_BCAST_PSRLW,   4 },
   17155              :   { 0x0fffffff, VEC_BCAST_PSRLD,   4 },
   17156              :   { 0x1fff1fff, VEC_BCAST_PSRLW,   3 },
   17157              :   { 0x1fffffff, VEC_BCAST_PSRLD,   3 },
   17158              :   { 0x3fff3fff, VEC_BCAST_PSRLW,   2 },
   17159              :   { 0x3fffffff, VEC_BCAST_PSRLD,   2 },
   17160              :   { 0x7fff7fff, VEC_BCAST_PSRLW,   1 },
   17161              :   { 0x7fffffff, VEC_BCAST_PSRLD,   1 },
   17162              :   { 0x80000000, VEC_BCAST_PSLLD,  31 },
   17163              :   { 0x80008000, VEC_BCAST_PSLLW,  15 },
   17164              :   { 0xc0000000, VEC_BCAST_PSLLD,  30 },
   17165              :   { 0xc000c000, VEC_BCAST_PSLLW,  14 },
   17166              :   { 0xe0000000, VEC_BCAST_PSLLD,  29 },
   17167              :   { 0xe000e000, VEC_BCAST_PSLLW,  13 },
   17168              :   { 0xf0000000, VEC_BCAST_PSLLD,  28 },
   17169              :   { 0xf000f000, VEC_BCAST_PSLLW,  12 },
   17170              :   { 0xf8000000, VEC_BCAST_PSLLD,  27 },
   17171              :   { 0xf800f800, VEC_BCAST_PSLLW,  11 },
   17172              :   { 0xfc000000, VEC_BCAST_PSLLD,  26 },
   17173              :   { 0xfc00fc00, VEC_BCAST_PSLLW,  10 },
   17174              :   { 0xfe000000, VEC_BCAST_PSLLD,  25 },
   17175              :   { 0xfe00fe00, VEC_BCAST_PSLLW,   9 },
   17176              :   { 0xfefefefe, VEC_BCAST_PADDB,   0 },
   17177              :   { 0xff000000, VEC_BCAST_PSLLD,  24 },
   17178              :   { 0xff00ff00, VEC_BCAST_PSLLW,   8 },
   17179              :   { 0xff800000, VEC_BCAST_PSLLD,  23 },
   17180              :   { 0xff80ff80, VEC_BCAST_PSLLW,   7 },
   17181              :   { 0xffc00000, VEC_BCAST_PSLLD,  22 },
   17182              :   { 0xffc0ffc0, VEC_BCAST_PSLLW,   6 },
   17183              :   { 0xffe00000, VEC_BCAST_PSLLD,  21 },
   17184              :   { 0xffe0ffe0, VEC_BCAST_PSLLW,   5 },
   17185              :   { 0xfff00000, VEC_BCAST_PSLLD,  20 },
   17186              :   { 0xfff0fff0, VEC_BCAST_PSLLW,   4 },
   17187              :   { 0xfff80000, VEC_BCAST_PSLLD,  19 },
   17188              :   { 0xfff8fff8, VEC_BCAST_PSLLW,   3 },
   17189              :   { 0xfffc0000, VEC_BCAST_PSLLD,  18 },
   17190              :   { 0xfffcfffc, VEC_BCAST_PSLLW,   2 },
   17191              :   { 0xfffe0000, VEC_BCAST_PSLLD,  17 },
   17192              :   { 0xfffefffe, VEC_BCAST_PSLLW,   1 },
   17193              :   { 0xffff0000, VEC_BCAST_PSLLD,  16 },
   17194              :   { 0xffff8000, VEC_BCAST_PSLLD,  15 },
   17195              :   { 0xffffc000, VEC_BCAST_PSLLD,  14 },
   17196              :   { 0xffffe000, VEC_BCAST_PSLLD,  13 },
   17197              :   { 0xfffff000, VEC_BCAST_PSLLD,  12 },
   17198              :   { 0xfffff800, VEC_BCAST_PSLLD,  11 },
   17199              :   { 0xfffffc00, VEC_BCAST_PSLLD,  10 },
   17200              :   { 0xfffffe00, VEC_BCAST_PSLLD,   9 },
   17201              :   { 0xffffff00, VEC_BCAST_PSLLD,   8 },
   17202              :   { 0xffffff80, VEC_BCAST_PSLLD,   7 },
   17203              :   { 0xffffffc0, VEC_BCAST_PSLLD,   6 },
   17204              :   { 0xffffffe0, VEC_BCAST_PSLLD,   5 },
   17205              :   { 0xfffffff0, VEC_BCAST_PSLLD,   4 },
   17206              :   { 0xfffffff8, VEC_BCAST_PSLLD,   3 },
   17207              :   { 0xfffffffc, VEC_BCAST_PSLLD,   2 },
   17208              :   { 0xfffffffe, VEC_BCAST_PSLLD,   1 },
   17209              :   { 0xffffffff, VEC_BCAST_PCMPEQ,  0 }
   17210              : };
   17211              : 
   17212              : /* Comparator for bsearch on ix86_vec_bcast_map.  */
   17213              : static int
   17214       314992 : ix86_vec_bcast_map_simode_cmp (const void *key, const void *entry)
   17215              : {
   17216       314992 :   return (*(const unsigned int*)key)
   17217       314992 :          - ((const ix86_vec_bcast_map_simode_t*)entry)->key;
   17218              : }
   17219              : 
   17220              : /* A subroutine of ix86_vector_duplicate_value.  Tries to efficiently
   17221              :    materialize V4SImode, V8SImode and V16SImode vectors from SImode
   17222              :    integer constants.  */
   17223              : static bool
   17224        47770 : ix86_vector_duplicate_simode_const (machine_mode mode, rtx target,
   17225              :                                     unsigned int val)
   17226              : {
   17227        47770 :   const ix86_vec_bcast_map_simode_t *entry;
   17228        47770 :   rtx tmp1, tmp2;
   17229              : 
   17230        47770 :   entry = (const ix86_vec_bcast_map_simode_t*)
   17231        47770 :           bsearch(&val, ix86_vec_bcast_map_simode,
   17232              :                   ARRAY_SIZE (ix86_vec_bcast_map_simode),
   17233              :                   sizeof (ix86_vec_bcast_map_simode_t),
   17234              :                   ix86_vec_bcast_map_simode_cmp);
   17235        47770 :   if (!entry)
   17236              :     return false;
   17237              : 
   17238        16085 :   switch (entry->alg)
   17239              :     {
   17240            0 :     case VEC_BCAST_PXOR:
   17241            0 :       if ((mode == V8SImode && !TARGET_AVX2)
   17242            0 :           || (mode == V16SImode && !TARGET_AVX512F))
   17243              :         return false;
   17244            0 :       emit_move_insn (target, CONST0_RTX (mode));
   17245            0 :       return true;
   17246              : 
   17247          155 :     case VEC_BCAST_PCMPEQ:
   17248          155 :       if ((mode == V4SImode && !TARGET_SSE2)
   17249          154 :           || (mode == V8SImode && !TARGET_AVX2)
   17250          127 :           || (mode == V16SImode && !TARGET_AVX512F))
   17251              :         return false;
   17252          127 :       emit_move_insn (target, CONSTM1_RTX (mode));
   17253          127 :       return true;
   17254              : 
   17255          585 :     case VEC_BCAST_PABSB:
   17256          585 :       if (mode == V4SImode && TARGET_SSE2)
   17257              :         {
   17258          460 :           tmp1 = gen_reg_rtx (V16QImode);
   17259          460 :           emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
   17260          460 :           tmp2 = gen_reg_rtx (V16QImode);
   17261          460 :           emit_insn (gen_absv16qi2 (tmp2, tmp1));
   17262              :         }
   17263          125 :       else if (mode == V8SImode && TARGET_AVX2)
   17264              :         {
   17265           68 :           tmp1 = gen_reg_rtx (V32QImode);
   17266           68 :           emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
   17267           68 :           tmp2 = gen_reg_rtx (V32QImode);
   17268           68 :           emit_insn (gen_absv32qi2 (tmp2, tmp1));
   17269              :         }
   17270           57 :       else if (mode == V16SImode && TARGET_AVX512BW)
   17271              :         {
   17272           49 :           tmp1 = gen_reg_rtx (V64QImode);
   17273           49 :           emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
   17274           49 :           tmp2 = gen_reg_rtx (V64QImode);
   17275           49 :           emit_insn (gen_absv64qi2 (tmp2, tmp1));
   17276              :         }
   17277              :       else
   17278              :         return false;
   17279              :       break;
   17280              : 
   17281          101 :     case VEC_BCAST_PADDB:
   17282          101 :       if (mode == V4SImode && TARGET_SSE2)
   17283              :         {
   17284           97 :           tmp1 = gen_reg_rtx (V16QImode);
   17285           97 :           emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
   17286           97 :           tmp2 = gen_reg_rtx (V16QImode);
   17287           97 :           emit_insn (gen_addv16qi3 (tmp2, tmp1, tmp1));
   17288              :         }
   17289            4 :       else if (mode == V8SImode && TARGET_AVX2)
   17290              :         {
   17291            1 :           tmp1 = gen_reg_rtx (V32QImode);
   17292            1 :           emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
   17293            1 :           tmp2 = gen_reg_rtx (V32QImode);
   17294            1 :           emit_insn (gen_addv32qi3 (tmp2, tmp1, tmp1));
   17295              :         }
   17296            3 :       else if (mode == V16SImode && TARGET_AVX512BW)
   17297              :         {
   17298            3 :           tmp1 = gen_reg_rtx (V64QImode);
   17299            3 :           emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
   17300            3 :           tmp2 = gen_reg_rtx (V64QImode);
   17301            3 :           emit_insn (gen_addv64qi3 (tmp2, tmp1, tmp1));
   17302              :         }
   17303              :       else
   17304              :         return false;
   17305              :       break;
   17306              : 
   17307         3659 :     case VEC_BCAST_PSRLW:
   17308         3659 :       if (mode == V4SImode && TARGET_SSE2)
   17309              :         {
   17310         3435 :           tmp1 = gen_reg_rtx (V8HImode);
   17311         3435 :           emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
   17312         3435 :           tmp2 = gen_reg_rtx (V8HImode);
   17313         3435 :           emit_insn (gen_lshrv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
   17314              :         }
   17315          224 :       else if (mode == V8SImode && TARGET_AVX2)
   17316              :         {
   17317          131 :           tmp1 = gen_reg_rtx (V16HImode);
   17318          131 :           emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
   17319          131 :           tmp2 = gen_reg_rtx (V16HImode);
   17320          131 :           emit_insn (gen_lshrv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
   17321              :         }
   17322           93 :       else if (mode == V16SImode && TARGET_AVX512BW)
   17323              :         {
   17324           90 :           tmp1 = gen_reg_rtx (V32HImode);
   17325           90 :           emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
   17326           90 :           tmp2 = gen_reg_rtx (V32HImode);
   17327           90 :           emit_insn (gen_lshrv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
   17328              :         }
   17329              :       else
   17330              :         return false;
   17331              :       break;
   17332              : 
   17333         9824 :     case VEC_BCAST_PSRLD:
   17334         9824 :       if (mode == V4SImode && TARGET_SSE2)
   17335              :         {
   17336         6981 :           tmp1 = gen_reg_rtx (V4SImode);
   17337         6981 :           emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
   17338         6981 :           emit_insn (gen_lshrv4si3 (target, tmp1, GEN_INT (entry->arg)));
   17339         6981 :           return true;
   17340              :         }
   17341         2843 :       else if (mode == V8SImode && TARGET_AVX2)
   17342              :         {
   17343         1056 :           tmp1 = gen_reg_rtx (V8SImode);
   17344         1056 :           emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
   17345         1056 :           emit_insn (gen_lshrv8si3 (target, tmp1, GEN_INT (entry->arg)));
   17346         1056 :           return true;
   17347              :         }
   17348         1787 :       else if (mode == V16SImode && TARGET_AVX512F)
   17349              :         {
   17350          948 :           tmp1 = gen_reg_rtx (V16SImode);
   17351          948 :           emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
   17352          948 :           emit_insn (gen_lshrv16si3 (target, tmp1, GEN_INT (entry->arg)));
   17353          948 :           return true;
   17354              :         }
   17355              :       else
   17356              :         return false;
   17357          132 :       break;
   17358              : 
   17359          132 :     case VEC_BCAST_PSLLW:
   17360          132 :       if (mode == V4SImode && TARGET_SSE2)
   17361              :         {
   17362          102 :           tmp1 = gen_reg_rtx (V8HImode);
   17363          102 :           emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
   17364          102 :           tmp2 = gen_reg_rtx (V8HImode);
   17365          102 :           emit_insn (gen_ashlv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
   17366              :         }
   17367           30 :       else if (mode == V8SImode && TARGET_AVX2)
   17368              :         {
   17369           21 :           tmp1 = gen_reg_rtx (V16HImode);
   17370           21 :           emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
   17371           21 :           tmp2 = gen_reg_rtx (V16HImode);
   17372           21 :           emit_insn (gen_ashlv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
   17373              :         }
   17374            9 :       else if (mode == V16SImode && TARGET_AVX512BW)
   17375              :         {
   17376            9 :           tmp1 = gen_reg_rtx (V32HImode);
   17377            9 :           emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
   17378            9 :           tmp2 = gen_reg_rtx (V32HImode);
   17379            9 :           emit_insn (gen_ashlv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
   17380              :         }
   17381              :       else
   17382              :         return false;
   17383              :       break;
   17384              : 
   17385         1629 :     case VEC_BCAST_PSLLD:
   17386         1629 :       if (mode == V4SImode && TARGET_SSE2)
   17387              :         {
   17388         1594 :           tmp1 = gen_reg_rtx (V4SImode);
   17389         1594 :           emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
   17390         1594 :           emit_insn (gen_ashlv4si3 (target, tmp1, GEN_INT (entry->arg)));
   17391         1594 :           return true;
   17392              :         }
   17393           35 :       else if (mode == V8SImode && TARGET_AVX2)
   17394              :         {
   17395           17 :           tmp1 = gen_reg_rtx (V8SImode);
   17396           17 :           emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
   17397           17 :           emit_insn (gen_ashlv8si3 (target, tmp1, GEN_INT (entry->arg)));
   17398           17 :           return true;
   17399              :         }
   17400           18 :       else if (mode == V16SImode && TARGET_AVX512F)
   17401              :         {
   17402           18 :           tmp1 = gen_reg_rtx (V16SImode);
   17403           18 :           emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
   17404           18 :           emit_insn (gen_ashlv16si3 (target, tmp1, GEN_INT (entry->arg)));
   17405           18 :           return true;
   17406              :         }
   17407              :       else
   17408              :         return false;
   17409              : 
   17410              :     default:
   17411              :       return false;
   17412              :     }
   17413              : 
   17414         4466 :   emit_move_insn (target, gen_lowpart (mode, tmp2));
   17415         4466 :   return true;
   17416              : }
   17417              : 
   17418              : /* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   17419              :    fill target with val via vec_duplicate.  */
   17420              : 
   17421              : static bool
   17422       147409 : ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
   17423              : {
   17424       147409 :   bool ok;
   17425       147409 :   rtx_insn *insn;
   17426       147409 :   rtx dup;
   17427              : 
   17428       147409 :   if ((mode == V4SImode || mode == V8SImode || mode == V16SImode)
   17429        55622 :       && CONST_INT_P (val)
   17430        47770 :       && ix86_vector_duplicate_simode_const (mode, target, INTVAL (val)))
   17431              :     return true;
   17432              : 
   17433              :   /* Save/restore recog_data in case this is called from splitters
   17434              :      or other routines where recog_data needs to stay valid across
   17435              :      force_reg.  See PR106577.  */
   17436       132202 :   recog_data_d recog_data_save = recog_data;
   17437              : 
   17438              :   /* First attempt to recognize VAL as-is.  */
   17439       132202 :   dup = gen_vec_duplicate (mode, val);
   17440       132202 :   insn = emit_insn (gen_rtx_SET (target, dup));
   17441       132202 :   if (recog_memoized (insn) < 0)
   17442              :     {
   17443        94793 :       rtx_insn *seq;
   17444        94793 :       machine_mode innermode = GET_MODE_INNER (mode);
   17445        94793 :       rtx reg;
   17446              : 
   17447              :       /* If that fails, force VAL into a register or mem.  */
   17448              : 
   17449        94793 :       start_sequence ();
   17450              : 
   17451            0 :       if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
   17452            0 :           && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
   17453        94793 :           && GET_MODE_BITSIZE(mode) >= 128)
   17454            0 :         reg = validize_mem (force_const_mem (innermode, val));
   17455              :       else
   17456              :         {
   17457        94793 :           reg = force_reg (innermode, val);
   17458        94793 :           if (GET_MODE (reg) != innermode)
   17459            0 :             reg = gen_lowpart (innermode, reg);
   17460              :         }
   17461              : 
   17462        94793 :       SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
   17463        94793 :       seq = end_sequence ();
   17464        94793 :       if (seq)
   17465        94793 :         emit_insn_before (seq, insn);
   17466              : 
   17467        94793 :       ok = recog_memoized (insn) >= 0;
   17468        94793 :       gcc_assert (ok);
   17469              :     }
   17470       132202 :   recog_data = recog_data_save;
   17471       132202 :   return true;
   17472              : }
   17473              : 
   17474              : /* Get a vector mode of the same size as the original but with elements
   17475              :    twice as wide.  This is only guaranteed to apply to integral vectors.  */
   17476              : 
   17477              : static machine_mode
   17478        19205 : get_mode_wider_vector (machine_mode o)
   17479              : {
   17480              :   /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
   17481        19205 :   machine_mode n = GET_MODE_NEXT_MODE (o).require ();
   17482        57615 :   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
   17483        57615 :   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
   17484        19205 :   return n;
   17485              : }
   17486              : 
   17487              : static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
   17488              : static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
   17489              : 
   17490              : /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   17491              :    with all elements equal to VAL.  Return true if successful.
                      :    MODE is the vector mode to build.  When MMX_OK is false the V2SI,
                      :    V2SF, V4HI, V8QI and V4QI cases give up and return false.
                      :    CONST_INT values in the V8HI, V16QI, V16HI, V32QI, V32HI and V64QI
                      :    cases (and every value in the V8QI and V4QI cases) are replicated
                      :    into the next wider element and handled by recursing on the wider
                      :    vector mode.  Modes not listed in the switch return false.  */
   17492              : 
   17493              : bool
   17494       167855 : ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
   17495              :                                    rtx target, rtx val)
   17496              : {
   17497       167855 :   bool ok;
   17498              : 
   17499       167855 :   switch (mode)
   17500              :     {
   17501        70173 :     case E_V2DImode:
   17502        70173 :       if (CONST_INT_P (val))
   17503              :         {
   17504        61390 :           int tmp = (int)INTVAL (val);
   17505        61390 :           if (tmp == (int)(INTVAL (val) >> 32))  /* VAL shifted right by 32 equals VAL when both are truncated to int, so TMP is broadcast as V4SI and the result reinterpreted as V2DI.  */
   17506              :             {
   17507          109 :               rtx reg = gen_reg_rtx (V4SImode);
   17508          109 :               ok = ix86_vector_duplicate_value (V4SImode, reg,
   17509              :                                                 GEN_INT (tmp));
   17510          109 :               if (ok)
   17511              :                 {
   17512          109 :                   emit_move_insn (target, gen_lowpart (V2DImode, reg));
   17513          109 :                   return true;
   17514              :                 }
   17515              :             }
   17516              :         }
   17517        70064 :       return ix86_vector_duplicate_value (mode, target, val);  /* Otherwise broadcast VAL in MODE itself.  */
   17518              : 
   17519          995 :     case E_V4DImode:
   17520          995 :       if (CONST_INT_P (val))
   17521              :         {
   17522          718 :           int tmp = (int)INTVAL (val);
   17523          718 :           if (tmp == (int)(INTVAL (val) >> 32))  /* Same check as for V2DImode, broadcasting through V8SI.  */
   17524              :             {
   17525           54 :               rtx reg = gen_reg_rtx (V8SImode);
   17526           54 :               ok = ix86_vector_duplicate_value (V8SImode, reg,
   17527              :                                                 GEN_INT (tmp));
   17528           54 :               if (ok)
   17529              :                 {
   17530           54 :                   emit_move_insn (target, gen_lowpart (V4DImode, reg));
   17531           54 :                   return true;
   17532              :                 }
   17533              :             }
   17534              :         }
   17535          941 :       return ix86_vector_duplicate_value (mode, target, val);
   17536              : 
   17537          463 :     case E_V8DImode:
   17538          463 :       if (CONST_INT_P (val))
   17539              :         {
   17540          264 :           int tmp = (int)INTVAL (val);
   17541          264 :           if (tmp == (int)(INTVAL (val) >> 32))  /* Same check as for V2DImode, broadcasting through V16SI.  */
   17542              :             {
   17543           24 :               rtx reg = gen_reg_rtx (V16SImode);
   17544           24 :               ok = ix86_vector_duplicate_value (V16SImode, reg,
   17545              :                                                 GEN_INT (tmp));
   17546           24 :               if (ok)
   17547              :                 {
   17548           24 :                   emit_move_insn (target, gen_lowpart (V8DImode, reg));
   17549           24 :                   return true;
   17550              :                 }
   17551              :             }
   17552              :         }
   17553          439 :       return ix86_vector_duplicate_value (mode, target, val);
   17554              : 
   17555         2604 :     case E_V2SImode:
   17556         2604 :     case E_V2SFmode:
   17557         2604 :       if (!mmx_ok)
   17558              :         return false;
   17559              :       /* FALLTHRU */
   17560              : 
   17561        74792 :     case E_V4DFmode:
   17562        74792 :     case E_V8SFmode:
   17563        74792 :     case E_V8SImode:
   17564        74792 :     case E_V2DFmode:
   17565        74792 :     case E_V4SFmode:
   17566        74792 :     case E_V4SImode:
   17567        74792 :     case E_V16SImode:
   17568        74792 :     case E_V16SFmode:
   17569        74792 :     case E_V8DFmode:
   17570        74792 :       return ix86_vector_duplicate_value (mode, target, val);
   17571              : 
   17572          387 :     case E_V4HImode:
   17573          387 :       if (!mmx_ok)
   17574              :         return false;
   17575          384 :       if (TARGET_SSE || TARGET_3DNOW_A)
   17576              :         {
   17577          384 :           rtx x;
   17578              : 
   17579          384 :           val = gen_lowpart (SImode, val);
   17580          384 :           if (CONST_INT_P (val))
   17581              :             return false;  /* A constant VAL is not expanded on this path.  */
   17582          382 :           x = gen_rtx_TRUNCATE (HImode, val);
   17583          382 :           x = gen_rtx_VEC_DUPLICATE (mode, x);
   17584          382 :           emit_insn (gen_rtx_SET (target, x));
   17585          382 :           return true;
   17586              :         }
   17587            0 :       goto widen;  /* Reached only when !TARGET_SSE && !TARGET_3DNOW_A.  */
   17588              : 
   17589            5 :     case E_V4HFmode:
   17590            5 :     case E_V4BFmode:
   17591            5 :       if (TARGET_MMX_WITH_SSE)
   17592              :         {
   17593           10 :           val = force_reg (GET_MODE_INNER (mode), val);
   17594            5 :           rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
   17595            5 :           emit_insn (gen_rtx_SET (target, x));
   17596            5 :           return true;
   17597              :         }
   17598              :       return false;
   17599              : 
   17600          126 :     case E_V2HImode:
   17601          126 :       if (TARGET_SSE2)
   17602              :         {
   17603          126 :           rtx x;
   17604              : 
   17605          126 :           val = gen_lowpart (SImode, val);
   17606          126 :           if (CONST_INT_P (val))
   17607              :             return false;  /* As in the V4HImode case, a constant VAL is rejected.  */
   17608          126 :           x = gen_rtx_TRUNCATE (HImode, val);
   17609          126 :           x = gen_rtx_VEC_DUPLICATE (mode, x);
   17610          126 :           emit_insn (gen_rtx_SET (target, x));
   17611          126 :           return true;
   17612              :         }
   17613              :       return false;
   17614              : 
   17615            3 :     case E_V2HFmode:
   17616            3 :     case E_V2BFmode:
   17617            3 :       if (TARGET_SSE2)
   17618              :         {
   17619            6 :           val = force_reg (GET_MODE_INNER (mode), val);
   17620            3 :           rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
   17621            3 :           emit_insn (gen_rtx_SET (target, x));
   17622            3 :           return true;
   17623              :         }
   17624              :       return false;
   17625              : 
   17626          303 :     case E_V8QImode:
   17627          303 :     case E_V4QImode:
   17628          303 :       if (!mmx_ok)
   17629              :         return false;
   17630          299 :       goto widen;
   17631              : 
   17632        10285 :     case E_V8HImode:
   17633        10285 :       if (CONST_INT_P (val))
   17634         9763 :         goto widen;
   17635              :       /* FALLTHRU */
   17636              : 
   17637          836 :     case E_V8HFmode:
   17638          836 :     case E_V8BFmode:
   17639          836 :       if (TARGET_AVX2)
   17640          392 :         return ix86_vector_duplicate_value (mode, target, val);
   17641              : 
   17642          444 :       if (TARGET_SSE2)
   17643              :         {
   17644         1135 :           struct expand_vec_perm_d dperm;
   17645         1135 :           rtx tmp1, tmp2;
   17646              : 
   17647          444 :         permute:  /* Also entered by goto from the V16QImode case.  */
   17648         1135 :           memset (&dperm, 0, sizeof (dperm));
   17649         1135 :           dperm.target = target;
   17650         1135 :           dperm.vmode = mode;
   17651         1135 :           dperm.nelt = GET_MODE_NUNITS (mode);
   17652         1135 :           dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
   17653         1135 :           dperm.one_operand_p = true;  /* The perm[] entries stay zero from the memset above.  */
   17654              : 
   17655         1135 :           if (mode == V8HFmode || mode == V8BFmode)
   17656              :             {
   17657            3 :               tmp1 = force_reg (GET_MODE_INNER (mode), val);
   17658            3 :               tmp2 = gen_reg_rtx (mode);
   17659            3 :               emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
   17660            3 :               tmp1 = gen_lowpart (mode, tmp2);
   17661              :             }
   17662              :           else
   17663              :             {
   17664              :               /* Extend to SImode using a paradoxical SUBREG.  */
   17665         1132 :               tmp1 = gen_reg_rtx (SImode);
   17666         1132 :               emit_move_insn (tmp1, gen_lowpart (SImode, val));
   17667              : 
   17668              :               /* Insert the SImode value as
   17669              :                  low element of a V4SImode vector.  */
   17670         1132 :               tmp2 = gen_reg_rtx (V4SImode);
   17671         1132 :               emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
   17672         1132 :               tmp1 = gen_lowpart (mode, tmp2);
   17673              :             }
   17674              : 
   17675         1135 :           emit_move_insn (dperm.op0, tmp1);
   17676         1135 :           ok = (expand_vec_perm_1 (&dperm)
   17677         1135 :                 || expand_vec_perm_broadcast_1 (&dperm));
   17678            0 :           gcc_assert (ok);  /* One of the two expanders is expected to succeed.  */
   17679         1135 :           return ok;
   17680              :         }
   17681            0 :       goto widen;  /* Reached only when !TARGET_SSE2.  */
   17682              : 
   17683         6011 :     case E_V16QImode:
   17684         6011 :       if (CONST_INT_P (val))
   17685         5260 :         goto widen;
   17686          751 :       if (TARGET_AVX2)
   17687           60 :         return ix86_vector_duplicate_value (mode, target, val);
   17688              : 
   17689          691 :       if (TARGET_SSE2)
   17690          691 :         goto permute;
   17691            0 :       goto widen;
   17692              : 
   17693        17653 :     widen:
   17694              :       /* Replicate the value once into the next wider mode and recurse.  */
   17695        17653 :       {
   17696        17653 :         machine_mode smode, wsmode, wvmode;
   17697        17653 :         rtx x;
   17698              : 
   17699        17653 :         smode = GET_MODE_INNER (mode);
   17700        17653 :         wvmode = get_mode_wider_vector (mode);
   17701        17653 :         wsmode = GET_MODE_INNER (wvmode);
   17702              : 
   17703        17653 :         val = convert_modes (wsmode, smode, val, true);
   17704              : 
   17705        17653 :         if (CONST_INT_P (val))
   17706              :           {
   17707        34710 :             x = simplify_binary_operation (ASHIFT, wsmode, val,
   17708        17355 :                                            GEN_INT (GET_MODE_BITSIZE (smode)));
   17709        17355 :             val = simplify_binary_operation (IOR, wsmode, val, x);  /* VAL | (VAL << bits of SMODE), folded at compile time.  */
   17710              :           }
   17711          298 :         else if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
   17712          298 :           emit_insn (gen_insv_1 (wsmode, val, val));  /* NOTE(review): presumably copies the low byte of VAL into its second byte -- confirm against the insv_1 pattern.  */
   17713              :         else
   17714              :           {
   17715            0 :             x = expand_simple_binop (wsmode, ASHIFT, val,
   17716            0 :                                      GEN_INT (GET_MODE_BITSIZE (smode)),
   17717              :                                      NULL_RTX, 1, OPTAB_LIB_WIDEN);
   17718            0 :             val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
   17719              :                                        OPTAB_LIB_WIDEN);
   17720              :           }
   17721              : 
   17722        17653 :         x = gen_reg_rtx (wvmode);
   17723        17653 :         ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
   17724        17653 :         if (!ok)
   17725              :           return false;
   17726        17652 :         emit_move_insn (target, gen_lowpart (GET_MODE (target), x));  /* View the wider vector in TARGET's own mode.  */
   17727        17652 :         return true;
   17728              :       }
   17729              : 
   17730         1474 :     case E_V16HImode:
   17731         1474 :     case E_V32QImode:
   17732         1474 :       if (CONST_INT_P (val))
   17733         1182 :         goto widen;
   17734              :       /* FALLTHRU */
   17735              : 
   17736          375 :     case E_V16HFmode:
   17737          375 :     case E_V16BFmode:
   17738          375 :       if (TARGET_AVX2)
   17739          347 :         return ix86_vector_duplicate_value (mode, target, val);
   17740              :       else
   17741              :         {
   17742           28 :           machine_mode hvmode;  /* Vector mode with half as many elements as MODE.  */
   17743           28 :           switch (mode)
   17744              :             {
   17745              :             case V16HImode:
   17746              :               hvmode = V8HImode;
   17747              :               break;
   17748            0 :             case V16HFmode:
   17749            0 :               hvmode = V8HFmode;
   17750            0 :               break;
   17751            1 :             case V16BFmode:
   17752            1 :               hvmode = V8BFmode;
   17753            1 :               break;
   17754           14 :             case V32QImode:
   17755           14 :               hvmode = V16QImode;
   17756           14 :               break;
   17757            0 :             default:
   17758            0 :               gcc_unreachable ();
   17759              :             }
   17760           28 :           rtx x = gen_reg_rtx (hvmode);
   17761              : 
   17762           28 :           ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);  /* Build one half with MMX_OK false, then concatenate it with itself below.  */
   17763           28 :           if (!ok)
   17764              :             return false;
   17765              : 
   17766           28 :           x = gen_rtx_VEC_CONCAT (mode, x, x);
   17767           28 :           emit_insn (gen_rtx_SET (target, x));
   17768              :         }
   17769           28 :       return true;
   17770              : 
   17771         1277 :     case E_V32HImode:
   17772         1277 :     case E_V64QImode:
   17773         1277 :       if (CONST_INT_P (val))
   17774         1149 :         goto widen;
   17775              :       /* FALLTHRU */
   17776              : 
   17777          207 :     case E_V32HFmode:
   17778          207 :     case E_V32BFmode:
   17779          207 :       if (TARGET_AVX512BW)
   17780          187 :         return ix86_vector_duplicate_value (mode, target, val);
   17781              :       else
   17782              :         {
   17783           20 :           machine_mode hvmode;  /* Vector mode with half as many elements as MODE.  */
   17784           20 :           switch (mode)
   17785              :             {
   17786              :             case V32HImode:
   17787              :               hvmode = V16HImode;
   17788              :               break;
   17789            0 :             case V32HFmode:
   17790            0 :               hvmode = V16HFmode;
   17791            0 :               break;
   17792            1 :             case V32BFmode:
   17793            1 :               hvmode = V16BFmode;
   17794            1 :               break;
   17795           10 :             case V64QImode:
   17796           10 :               hvmode = V32QImode;
   17797           10 :               break;
   17798            0 :             default:
   17799            0 :               gcc_unreachable ();
   17800              :             }
   17801           20 :           rtx x = gen_reg_rtx (hvmode);
   17802              : 
   17803           20 :           ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);  /* Same half-then-concatenate scheme as the 256-bit case above.  */
   17804           20 :           if (!ok)
   17805              :             return false;
   17806              : 
   17807           20 :           x = gen_rtx_VEC_CONCAT (mode, x, x);
   17808           20 :           emit_insn (gen_rtx_SET (target, x));
   17809              :         }
   17810           20 :       return true;
   17811              : 
   17812              :     default:
   17813              :       return false;
   17814              :     }
   17815              : }
   17816              : 
   17817              : /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   17818              :    whose ONE_VAR element is VAR, and other elements are zero.  Return true
   17819              :    if successful.
                      :    The first switch decides, per MODE and target flags, whether the
                      :    vector-set path may be used and which vec_set_0 generator (if any)
                      :    applies; on that path the function always returns true.  Otherwise
                      :    the second switch tries mode-specific fallbacks and returns false
                      :    for the cases it does not handle (for example !MMX_OK for the
                      :    64-bit modes listed there, or ONE_VAR != 0 in the two-element and
                      :    widening cases).  */
   17820              : 
   17821              : bool
   17822        10335 : ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
   17823              :                                      rtx target, rtx var, int one_var)
   17824              : {
   17825        10335 :   machine_mode vsimode;
   17826        10335 :   rtx new_target;
   17827        10335 :   rtx x, tmp;
   17828        10335 :   bool use_vector_set = false;
   17829        10335 :   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;  /* Local; hides the 4-argument gen_vec_set_0 (mode, ...) called in ix86_expand_vector_init_duplicate.  */
   17830              : 
   17831        10335 :   switch (mode)
   17832              :     {
   17833         7925 :     case E_V2DImode:
   17834              :       /* For SSE4.1, we normally use vector set.  But if the second
   17835              :          element is zero and inter-unit moves are OK, we use movq
   17836              :          instead.  */
   17837         7916 :       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
   17838         8048 :                         && !(TARGET_INTER_UNIT_MOVES_TO_VEC
   17839              :                              && one_var == 0));
   17840              :       break;
   17841          858 :     case E_V16QImode:
   17842          858 :     case E_V4SImode:
   17843          858 :     case E_V4SFmode:
   17844          858 :       use_vector_set = TARGET_SSE4_1;
   17845          858 :       break;
   17846           85 :     case E_V8HImode:
   17847           85 :       use_vector_set = TARGET_SSE2;
   17848           85 :       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
   17849           85 :         ? gen_vec_setv8hi_0 : NULL;
   17850              :       break;
   17851            4 :     case E_V8QImode:
   17852            4 :       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   17853              :       break;
   17854           14 :     case E_V4HImode:
   17855           14 :     case E_V4HFmode:
   17856           14 :     case E_V4BFmode:
   17857           14 :       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
   17858              :       break;
   17859           32 :     case E_V4QImode:
   17860           32 :       use_vector_set = TARGET_SSE4_1;
   17861           32 :       break;
   17862            0 :     case E_V32QImode:
   17863            0 :       use_vector_set = TARGET_AVX;
   17864            0 :       break;
   17865            5 :     case E_V16HImode:
   17866            5 :       use_vector_set = TARGET_AVX;
   17867            5 :       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
   17868            5 :         ? gen_vec_setv16hi_0 : NULL;
   17869              :       break;
   17870            5 :     case E_V8SImode:
   17871            5 :       use_vector_set = TARGET_AVX;
   17872            5 :       gen_vec_set_0 = gen_vec_setv8si_0;
   17873            5 :       break;
   17874           22 :     case E_V8SFmode:
   17875           22 :       use_vector_set = TARGET_AVX;
   17876           22 :       gen_vec_set_0 = gen_vec_setv8sf_0;
   17877           22 :       break;
   17878           13 :     case E_V4DFmode:
   17879           13 :       use_vector_set = TARGET_AVX;
   17880           13 :       gen_vec_set_0 = gen_vec_setv4df_0;
   17881           13 :       break;
   17882            7 :     case E_V4DImode:
   17883              :       /* Use ix86_expand_vector_set in 64bit mode only.  */
   17884            7 :       use_vector_set = TARGET_AVX && TARGET_64BIT;
   17885              :       gen_vec_set_0 = gen_vec_setv4di_0;
   17886              :       break;
   17887           17 :     case E_V16SImode:
   17888           17 :       use_vector_set = TARGET_AVX512F && one_var == 0;
   17889              :       gen_vec_set_0 = gen_vec_setv16si_0;
   17890              :       break;
   17891           22 :     case E_V16SFmode:
   17892           22 :       use_vector_set = TARGET_AVX512F && one_var == 0;
   17893              :       gen_vec_set_0 = gen_vec_setv16sf_0;
   17894              :       break;
   17895            0 :     case E_V8DFmode:
   17896            0 :       use_vector_set = TARGET_AVX512F && one_var == 0;
   17897              :       gen_vec_set_0 = gen_vec_setv8df_0;
   17898              :       break;
   17899            2 :     case E_V8DImode:
   17900              :       /* Use ix86_expand_vector_set in 64bit mode only.  */
   17901            2 :       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
   17902              :       gen_vec_set_0 = gen_vec_setv8di_0;
   17903              :       break;
   17904           39 :     case E_V8HFmode:
   17905           39 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17906              :       gen_vec_set_0 = gen_vec_setv8hf_0;
   17907              :       break;
   17908            9 :     case E_V16HFmode:
   17909            9 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17910              :       gen_vec_set_0 = gen_vec_setv16hf_0;
   17911              :       break;
   17912            6 :     case E_V32HFmode:
   17913            6 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17914              :       gen_vec_set_0 = gen_vec_setv32hf_0;
   17915              :       break;
   17916            2 :     case E_V8BFmode:
   17917            2 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17918              :       gen_vec_set_0 = gen_vec_setv8bf_0;
   17919              :       break;
   17920            0 :     case E_V16BFmode:
   17921            0 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17922              :       gen_vec_set_0 = gen_vec_setv16bf_0;
   17923              :       break;
   17924            0 :     case E_V32BFmode:
   17925            0 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17926              :       gen_vec_set_0 = gen_vec_setv32bf_0;
   17927              :       break;
   17928            4 :     case E_V32HImode:
   17929            4 :       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   17930              :       gen_vec_set_0 = gen_vec_setv32hi_0;  /* No break here: control falls into default, which only breaks.  */
   17931              :     default:
   17932              :       break;
   17933              :     }
   17934              : 
   17935         8959 :   if (use_vector_set)
   17936              :     {
   17937          857 :       if (gen_vec_set_0 && one_var == 0)  /* Element 0 with a generator: emit the vec_set_0 pattern on a zero vector.  */
   17938              :         {
   17939          354 :           var = force_reg (GET_MODE_INNER (mode), var);
   17940          177 :           emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
   17941          177 :           return true;
   17942              :         }
   17943          680 :       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));  /* Otherwise zero TARGET first, then set the single element.  */
   17944         1360 :       var = force_reg (GET_MODE_INNER (mode), var);
   17945          680 :       ix86_expand_vector_set (mmx_ok, target, var, one_var);
   17946          680 :       return true;
   17947              :     }
   17948              : 
   17949         9478 :   switch (mode)
   17950              :     {
   17951         1166 :     case E_V2SFmode:
   17952         1166 :     case E_V2SImode:
   17953         1166 :       if (!mmx_ok)
   17954              :         return false;
   17955              :       /* FALLTHRU */
   17956              : 
   17957         8173 :     case E_V2DFmode:
   17958         8173 :     case E_V2DImode:
   17959         8173 :       if (one_var != 0)
   17960              :         return false;  /* Only element 0 is handled for the two-element modes.  */
   17961         4986 :       var = force_reg (GET_MODE_INNER (mode), var);
   17962         4986 :       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
   17963         2493 :       emit_insn (gen_rtx_SET (target, x));
   17964         2493 :       return true;
   17965              : 
   17966          294 :     case E_V4SFmode:
   17967          294 :     case E_V4SImode:
   17968          294 :       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)  /* Work in a fresh pseudo unless TARGET is already a pseudo register.  */
   17969            0 :         new_target = gen_reg_rtx (mode);
   17970              :       else
   17971              :         new_target = target;
   17972          588 :       var = force_reg (GET_MODE_INNER (mode), var);
   17973          294 :       x = gen_rtx_VEC_DUPLICATE (mode, var);
   17974          294 :       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);  /* Mask const1_rtx: per VEC_MERGE semantics VAR is taken for element 0 and zero for the rest.  */
   17975          294 :       emit_insn (gen_rtx_SET (new_target, x));
   17976          294 :       if (one_var != 0)
   17977              :         {
   17978              :           /* We need to shuffle the value to the correct position, so
   17979              :              create a new pseudo to store the intermediate result.  */
   17980              : 
   17981              :           /* With SSE2, we can use the integer shuffle insns.  */
   17982           41 :           if (mode != V4SFmode && TARGET_SSE2)
   17983              :             {
   17984           28 :               emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
   17985              :                                             const1_rtx,
   17986           28 :                                             GEN_INT (one_var == 1 ? 0 : 1),
   17987           28 :                                             GEN_INT (one_var == 2 ? 0 : 1),
   17988           28 :                                             GEN_INT (one_var == 3 ? 0 : 1)));  /* Selector is 0 for position ONE_VAR and 1 for every other position.  */
   17989           28 :               if (target != new_target)
   17990            0 :                 emit_move_insn (target, new_target);
   17991           28 :               return true;
   17992              :             }
   17993              : 
   17994              :           /* Otherwise convert the intermediate result to V4SFmode and
   17995              :              use the SSE1 shuffle instructions.  */
   17996            0 :           if (mode != V4SFmode)
   17997              :             {
   17998            0 :               tmp = gen_reg_rtx (V4SFmode);
   17999            0 :               emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
   18000              :             }
   18001              :           else
   18002              :             tmp = new_target;
   18003              : 
   18004           43 :           emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
   18005              :                                        const1_rtx,
   18006           13 :                                        GEN_INT (one_var == 1 ? 0 : 1),
   18007              :                                        GEN_INT (one_var == 2 ? 0+4 : 1+4),
   18008              :                                        GEN_INT (one_var == 3 ? 0+4 : 1+4)));  /* Same selection as above; the last two selectors carry a +4 offset.  */
   18009              : 
   18010           13 :           if (mode != V4SFmode)
   18011            0 :             emit_move_insn (target, gen_lowpart (V4SImode, tmp));
   18012           13 :           else if (tmp != target)
   18013            0 :             emit_move_insn (target, tmp);
   18014              :         }
   18015          253 :       else if (target != new_target)
   18016            0 :         emit_move_insn (target, new_target);
   18017              :       return true;
   18018              : 
   18019           13 :     case E_V8HImode:
   18020           13 :     case E_V16QImode:
   18021           13 :       vsimode = V4SImode;
   18022           13 :       goto widen;
   18023            3 :     case E_V4HImode:
   18024            3 :     case E_V8QImode:
   18025            3 :       if (!mmx_ok)
   18026              :         return false;
   18027            3 :       vsimode = V2SImode;
   18028            3 :       goto widen;
   18029           16 :     widen:
   18030           16 :       if (one_var != 0)
   18031              :         return false;  /* Widening is attempted for element 0 only.  */
   18032              : 
   18033              :       /* Zero extend the variable element to SImode and recurse.  */
   18034           16 :       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
   18035              : 
   18036            8 :       x = gen_reg_rtx (vsimode);
   18037            8 :       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
   18038              :                                                 var, one_var))
   18039            0 :         gcc_unreachable ();  /* The SImode-element recursion is expected to succeed.  */
   18040              : 
   18041            8 :       emit_move_insn (target, gen_lowpart (mode, x));
   18042            8 :       return true;
   18043              : 
   18044              :     default:
   18045              :       return false;
   18046              :     }
   18047              : }
   18048              : 
   18049              : /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   18050              :    consisting of the values in VALS.  It is known that all elements
   18051              :    except ONE_VAR are constants.  Return true if successful.
                      :    The constant part (VALS with element ONE_VAR replaced by zero) is
                      :    loaded first and the variable element is then inserted with
                      :    ix86_expand_vector_set.  The QImode-element cases that do not take
                      :    the direct path instead merge the variable byte with its
                      :    neighbouring constant byte and insert the pair as one HImode
                      :    element.  Two-element modes and modes not listed return false.  */
   18052              : 
   18053              : static bool
   18054         7838 : ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
   18055              :                                  rtx target, rtx vals, int one_var)
   18056              : {
   18057         7838 :   rtx var = XVECEXP (vals, 0, one_var);
   18058         7838 :   machine_mode wmode;
   18059         7838 :   rtx const_vec, x;
   18060              : 
   18061         7838 :   const_vec = copy_rtx (vals);
   18062         7838 :   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));  /* Zero the variable slot in the copy so it can become a CONST_VECTOR.  */
   18063         7838 :   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
   18064              : 
   18065         7838 :   switch (mode)
   18066              :     {
   18067              :     case E_V2DFmode:
   18068              :     case E_V2DImode:
   18069              :     case E_V2SFmode:
   18070              :     case E_V2SImode:
   18071              :       /* For the two element vectors, it's just as easy to use
   18072              :          the general case.  */
   18073              :       return false;
   18074              : 
   18075            3 :     case E_V4DImode:
   18076              :       /* Use ix86_expand_vector_set in 64bit mode only.  */
   18077            3 :       if (!TARGET_64BIT)
   18078              :         return false;
   18079              :       /* FALLTHRU */
   18080              :     case E_V8HFmode:
   18081              :     case E_V16HFmode:
   18082              :     case E_V8BFmode:
   18083              :     case E_V16BFmode:
   18084              :     case E_V4DFmode:
   18085              :     case E_V8SFmode:
   18086              :     case E_V8SImode:
   18087              :     case E_V16HImode:
   18088              :     case E_V32QImode:
   18089              :     case E_V4SFmode:
   18090              :     case E_V4SImode:
   18091              :     case E_V8HImode:
   18092              :     case E_V4HImode:
   18093              :     case E_V4HFmode:
   18094              :     case E_V4BFmode:
   18095              :       break;  /* Handled by the common code after the switch.  */
   18096              : 
   18097            8 :     case E_V16QImode:
   18098            8 :       if (TARGET_SSE4_1)
   18099              :         break;
   18100            8 :       wmode = V8HImode;
   18101            8 :       goto widen;
   18102            1 :     case E_V8QImode:
   18103            1 :       if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
   18104              :         break;
   18105            1 :       wmode = V4HImode;
   18106            1 :       goto widen;
   18107           38 :     case E_V4QImode:
   18108           38 :       if (TARGET_SSE4_1)
   18109              :         break;
   18110              :       wmode = V2HImode;  /* Falls into the widen label below.  */
   18111           47 :     widen:
   18112              :       /* There's no way to set one QImode entry easily.  Combine
   18113              :          the variable value with its adjacent constant value, and
   18114              :          promote to an HImode set.  */
   18115           47 :       x = XVECEXP (vals, 0, one_var ^ 1);  /* The neighbouring element: index ONE_VAR with its low bit flipped.  */
   18116           47 :       if (one_var & 1)
   18117              :         {
   18118           13 :           var = convert_modes (HImode, QImode, var, true);
   18119           13 :           var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
   18120              :                                      NULL_RTX, 1, OPTAB_LIB_WIDEN);
   18121           13 :           x = GEN_INT (INTVAL (x) & 0xff);  /* Odd index: VAR is shifted into bits 8-15, the constant keeps bits 0-7.  */
   18122              :         }
   18123              :       else
   18124              :         {
   18125           34 :           var = convert_modes (HImode, QImode, var, true);
   18126           34 :           x = gen_int_mode (UINTVAL (x) << 8, HImode);  /* Even index: VAR keeps bits 0-7, the constant is shifted into bits 8-15.  */
   18127              :         }
   18128           47 :       if (x != const0_rtx)  /* The IOR is skipped when the constant part is zero.  */
   18129            7 :         var = expand_simple_binop (HImode, IOR, var, x, var,
   18130              :                                    1, OPTAB_LIB_WIDEN);
   18131              : 
   18132           47 :       x = gen_reg_rtx (wmode);
   18133           47 :       emit_move_insn (x, gen_lowpart (wmode, const_vec));
   18134           47 :       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);  /* The HImode element index is half the QImode index.  */
   18135              : 
   18136           47 :       emit_move_insn (target, gen_lowpart (mode, x));
   18137           47 :       return true;
   18138              : 
   18139              :     default:
   18140              :       return false;
   18141              :     }
   18142              : 
   18143          193 :   emit_move_insn (target, const_vec);  /* Common path: load the constants, then overwrite element ONE_VAR.  */
   18144          193 :   ix86_expand_vector_set (mmx_ok, target, var, one_var);
   18145          193 :   return true;
   18146              : }
   18147              : 
   18148              : /* A subroutine of ix86_expand_vector_init_general.  Use vector concatenate
   18149              :    to handle the most general case: all values variable, and none identical.
   18150              :    N == 2 emits one VEC_CONCAT; N == 4, 8 or 16 builds each half first.  */
   18151              : 
   18152              : static void
   18153       118383 : ix86_expand_vector_init_concat (machine_mode mode,
   18154              :                                 rtx target, rtx *ops, int n)
   18155              : {
   18156       118383 :   machine_mode half_mode = VOIDmode;
   18157       118383 :   rtx half[2];
   18158       118383 :   rtvec v;
   18159       118383 :   int i, j;
   18160              : 
   18161       118383 :   switch (n)
   18162              :     {
   18163       110234 :     case 2:
   18164       110234 :       switch (mode)  /* Select the mode of each of the two operands.  */
   18165              :         {
   18166              :         case E_V32HFmode:
   18167              :           half_mode = V16HFmode;
   18168              :           break;
   18169            0 :         case E_V32BFmode:
   18170            0 :           half_mode = V16BFmode;
   18171            0 :           break;
   18172           79 :         case E_V16SImode:
   18173           79 :           half_mode = V8SImode;
   18174           79 :           break;
   18175           33 :         case E_V16SFmode:
   18176           33 :           half_mode = V8SFmode;
   18177           33 :           break;
   18178           92 :         case E_V8DImode:
   18179           92 :           half_mode = V4DImode;
   18180           92 :           break;
   18181           59 :         case E_V8DFmode:
   18182           59 :           half_mode = V4DFmode;
   18183           59 :           break;
   18184            0 :         case E_V16HFmode:
   18185            0 :           half_mode = V8HFmode;
   18186            0 :           break;
   18187            0 :         case E_V16BFmode:
   18188            0 :           half_mode = V8BFmode;
   18189            0 :           break;
   18190          191 :         case E_V8SImode:
   18191          191 :           half_mode = V4SImode;
   18192          191 :           break;
   18193          259 :         case E_V8SFmode:
   18194          259 :           half_mode = V4SFmode;
   18195          259 :           break;
   18196          304 :         case E_V4DImode:
   18197          304 :           half_mode = V2DImode;
   18198          304 :           break;
   18199          503 :         case E_V4DFmode:
   18200          503 :           half_mode = V2DFmode;
   18201          503 :           break;
   18202         5808 :         case E_V4SImode:
   18203         5808 :           half_mode = V2SImode;
   18204         5808 :           break;
   18205         2087 :         case E_V4SFmode:
   18206         2087 :           half_mode = V2SFmode;
   18207         2087 :           break;
   18208        65097 :         case E_V2DImode:
   18209        65097 :           half_mode = DImode;
   18210        65097 :           break;
   18211        26919 :         case E_V2SImode:
   18212        26919 :           half_mode = SImode;
   18213        26919 :           break;
   18214         3431 :         case E_V2DFmode:
   18215         3431 :           half_mode = DFmode;
   18216         3431 :           break;
   18217         5372 :         case E_V2SFmode:
   18218         5372 :           half_mode = SFmode;
   18219         5372 :           break;
   18220            0 :         default:
   18221            0 :           gcc_unreachable ();
   18222              :         }
   18223              : 
   18224       110234 :       if (!register_operand (ops[1], half_mode))
   18225        47912 :         ops[1] = force_reg (half_mode, ops[1]);
   18226       110234 :       if (!register_operand (ops[0], half_mode))
   18227        36205 :         ops[0] = force_reg (half_mode, ops[0]);
   18228       110234 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
   18229              :                                                           ops[1])));
   18230       110234 :       break;
   18231              : 
   18232         7541 :     case 4:
   18233         7541 :       switch (mode)
   18234              :         {
   18235              :         case E_V4DImode:
   18236              :           half_mode = V2DImode;
   18237              :           break;
   18238          476 :         case E_V4DFmode:
   18239          476 :           half_mode = V2DFmode;
   18240          476 :           break;
   18241         4875 :         case E_V4SImode:
   18242         4875 :           half_mode = V2SImode;
   18243         4875 :           break;
   18244         2012 :         case E_V4SFmode:
   18245         2012 :           half_mode = V2SFmode;
   18246         2012 :           break;
   18247            0 :         default:
   18248            0 :           gcc_unreachable ();
   18249              :         }
   18250         7541 :       goto half;
   18251              : 
   18252          517 :     case 8:
   18253          517 :       switch (mode)
   18254              :         {
   18255              :         case E_V8DImode:
   18256              :           half_mode = V4DImode;
   18257              :           break;
   18258           59 :         case E_V8DFmode:
   18259           59 :           half_mode = V4DFmode;
   18260           59 :           break;
   18261          154 :         case E_V8SImode:
   18262          154 :           half_mode = V4SImode;
   18263          154 :           break;
   18264          253 :         case E_V8SFmode:
   18265          253 :           half_mode = V4SFmode;
   18266          253 :           break;
   18267            0 :         default:
   18268            0 :           gcc_unreachable ();
   18269              :         }
   18270          517 :       goto half;
   18271              : 
   18272           91 :     case 16:
   18273           91 :       switch (mode)
   18274              :         {
   18275              :         case E_V16SImode:
   18276              :           half_mode = V8SImode;
   18277              :           break;
   18278           33 :         case E_V16SFmode:
   18279           33 :           half_mode = V8SFmode;
   18280           33 :           break;
   18281            0 :         default:
   18282            0 :           gcc_unreachable ();
   18283              :         }
   18284           91 :       goto half;
   18285              : 
   18286         8149 : half:
   18287              :       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
   18288         8149 :       i = n - 1;  /* Index of the last input in OPS.  */
   18289        24447 :       for (j = 1; j != -1; j--)  /* Upper-indexed half (j == 1) first.  */
   18290              :         {
   18291        16298 :           half[j] = gen_reg_rtx (half_mode);
   18292        16298 :           switch (n >> 1)  /* Number of inputs that make up one half.  */
   18293              :             {
   18294        15082 :             case 2:
   18295        15082 :               v = gen_rtvec (2, ops[i-1], ops[i]);
   18296        15082 :               i -= 2;
   18297        15082 :               break;
   18298         1034 :             case 4:
   18299         1034 :               v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
   18300         1034 :               i -= 4;
   18301         1034 :               break;
   18302          182 :             case 8:
   18303          364 :               v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
   18304          182 :                              ops[i-3], ops[i-2], ops[i-1], ops[i]);
   18305          182 :               i -= 8;
   18306          182 :               break;
   18307            0 :             default:
   18308            0 :               gcc_unreachable ();
   18309              :             }
   18310        16298 :           ix86_expand_vector_init (false, half[j],
   18311              :                                    gen_rtx_PARALLEL (half_mode, v));
   18312              :         }
   18313              : 
   18314         8149 :       ix86_expand_vector_init_concat (mode, target, half, 2);  /* Join the halves.  */
   18315         8149 :       break;
   18316              : 
   18317            0 :     default:
   18318            0 :       gcc_unreachable ();
   18319              :     }
   18320       118383 : }
   18321              : 
   18322              : /* A subroutine of ix86_expand_vector_init_general.  Use vector
   18323              :    interleave to handle the most general case: all values variable,
   18324              :    and none identical.  OPS holds 2 * N elements and is overwritten.  */
   18325              : 
   18326              : static void
   18327         3881 : ix86_expand_vector_init_interleave (machine_mode mode,
   18328              :                                     rtx target, rtx *ops, int n)
   18329              : {
   18330         3881 :   machine_mode first_imode, second_imode, third_imode, inner_mode;
   18331         3881 :   int i, j;
   18332         3881 :   rtx op, op0, op1;
   18333         3881 :   rtx (*gen_load_even) (rtx, rtx, rtx);
   18334         3881 :   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
   18335         3881 :   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
   18336              : 
   18337         3881 :   switch (mode)
   18338              :     {
   18339              :     case E_V8HFmode:
   18340              :       gen_load_even = gen_vec_interleave_lowv8hf;
   18341              :       gen_interleave_first_low = gen_vec_interleave_lowv4si;
   18342              :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18343              :       inner_mode = HFmode;
   18344              :       first_imode = V4SImode;
   18345              :       second_imode = V2DImode;
   18346              :       third_imode = VOIDmode;
   18347              :       break;
   18348          487 :     case E_V8BFmode:
   18349          487 :       gen_load_even = gen_vec_interleave_lowv8bf;
   18350          487 :       gen_interleave_first_low = gen_vec_interleave_lowv4si;
   18351          487 :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18352          487 :       inner_mode = BFmode;
   18353          487 :       first_imode = V4SImode;
   18354          487 :       second_imode = V2DImode;
   18355          487 :       third_imode = VOIDmode;
   18356          487 :       break;
   18357          793 :     case E_V8HImode:
   18358          793 :       gen_load_even = gen_vec_setv8hi;
   18359          793 :       gen_interleave_first_low = gen_vec_interleave_lowv4si;
   18360          793 :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18361          793 :       inner_mode = HImode;
   18362          793 :       first_imode = V4SImode;
   18363          793 :       second_imode = V2DImode;
   18364          793 :       third_imode = VOIDmode;
   18365          793 :       break;
   18366          374 :     case E_V16QImode:
   18367          374 :       gen_load_even = gen_vec_setv16qi;
   18368          374 :       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
   18369          374 :       gen_interleave_second_low = gen_vec_interleave_lowv4si;
   18370          374 :       inner_mode = QImode;
   18371          374 :       first_imode = V8HImode;
   18372          374 :       second_imode = V4SImode;
   18373          374 :       third_imode = V2DImode;
   18374          374 :       break;
   18375            0 :     default:
   18376            0 :       gcc_unreachable ();
   18377              :     }
   18378              : 
   18379        20901 :   for (i = 0; i < n; i++)  /* Pack one (even, odd) element pair per pass.  */
   18380              :     {
   18381        17020 :       op = ops [i + i];
   18382        17020 :       if (inner_mode == HFmode || inner_mode == BFmode)
   18383              :         {
   18384        10856 :           rtx even, odd;
   18385              :           /* Use vpunpcklwd to pack 2 HFmode or BFmode.  */
   18386         1948 :           machine_mode vec_mode =
   18387        10856 :             (inner_mode == HFmode) ? V8HFmode : V8BFmode;
   18388        10856 :           op0 = gen_reg_rtx (vec_mode);
   18389        10856 :           even = lowpart_subreg (vec_mode,
   18390              :                                  force_reg (inner_mode, op), inner_mode);
   18391        10856 :           odd = lowpart_subreg (vec_mode,
   18392        10856 :                                 force_reg (inner_mode, ops[i + i + 1]),
   18393              :                                 inner_mode);
   18394        10856 :           emit_insn (gen_load_even (op0, even, odd));
   18395              :         }
   18396              :       else
   18397              :         {
   18398              :           /* Extend the even element to SImode using a paradoxical SUBREG.  */
   18399         6164 :           op0 = gen_reg_rtx (SImode);
   18400         6164 :           emit_move_insn (op0, gen_lowpart (SImode, op));
   18401              : 
   18402              :           /* Insert the SImode value as low element of V4SImode vector.  */
   18403         6164 :           op1 = gen_reg_rtx (V4SImode);
   18404         6164 :           op0 = gen_rtx_VEC_MERGE (V4SImode,
   18405              :                                    gen_rtx_VEC_DUPLICATE (V4SImode,
   18406              :                                                           op0),
   18407              :                                    CONST0_RTX (V4SImode),
   18408              :                                    const1_rtx);
   18409         6164 :           emit_insn (gen_rtx_SET (op1, op0));
   18410              : 
   18411              :           /* Cast the V4SImode vector back to a vector in original mode.  */
   18412         6164 :           op0 = gen_reg_rtx (mode);
   18413         6164 :           emit_move_insn (op0, gen_lowpart (mode, op1));
   18414              : 
   18415              :           /* Load the odd element into the second position.  */
   18416         6164 :           emit_insn (gen_load_even (op0,
   18417              :                                     force_reg (inner_mode,
   18418         6164 :                                                ops[i + i + 1]),
   18419              :                                     const1_rtx));
   18420              :         }
   18421              : 
   18422              :       /* Cast vector to FIRST_IMODE vector.  */
   18423        17020 :       ops[i] = gen_reg_rtx (first_imode);
   18424        17020 :       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
   18425              :     }
   18426              : 
   18427              :   /* Interleave low FIRST_IMODE vectors.  */
   18428        12391 :   for (i = j = 0; i < n; i += 2, j++)
   18429              :     {
   18430         8510 :       op0 = gen_reg_rtx (first_imode);
   18431         8510 :       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
   18432              : 
   18433              :       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
   18434         8510 :       ops[j] = gen_reg_rtx (second_imode);
   18435         8510 :       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
   18436              :     }
   18437              : 
   18438              :   /* Interleave low SECOND_IMODE vectors.  */
   18439         3881 :   switch (second_imode)
   18440              :     {
   18441              :     case E_V4SImode:  /* Only V16QImode selects this: one extra level.  */
   18442         1122 :       for (i = j = 0; i < n / 2; i += 2, j++)
   18443              :         {
   18444          748 :           op0 = gen_reg_rtx (second_imode);
   18445          748 :           emit_insn (gen_interleave_second_low (op0, ops[i],
   18446          748 :                                                 ops[i + 1]));
   18447              : 
   18448              :           /* Cast the SECOND_IMODE vector to the THIRD_IMODE
   18449              :              vector.  */
   18450          748 :           ops[j] = gen_reg_rtx (third_imode);
   18451          748 :           emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
   18452              :         }
   18453              :       second_imode = V2DImode;
   18454              :       gen_interleave_second_low = gen_vec_interleave_lowv2di;
   18455              :       /* FALLTHRU */
   18456              : 
   18457         3881 :     case E_V2DImode:
   18458         3881 :       op0 = gen_reg_rtx (second_imode);
   18459         3881 :       emit_insn (gen_interleave_second_low (op0, ops[0],
   18460              :                                             ops[1]));
   18461              : 
   18462              :       /* Cast the SECOND_IMODE vector back to a vector in the original
   18463              :          mode.  */
   18464         3881 :       emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
   18465         3881 :       break;
   18466              : 
   18467              :     default:
   18468              :       gcc_unreachable ();
   18469              :     }
   18470         3881 : }
   18471              : 
   18472              : /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   18473              :    all values variable, and none identical.  Dispatches on MODE.  */
   18474              : 
   18475              : static void
   18476       119421 : ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
   18477              :                                  rtx target, rtx vals)
   18478              : {
   18479       119421 :   rtx ops[64], op0, op1, op2, op3, op4, op5;
   18480       119421 :   machine_mode half_mode = VOIDmode;
   18481       119421 :   machine_mode quarter_mode = VOIDmode;
   18482       119421 :   machine_mode int_inner_mode = VOIDmode;
   18483       119421 :   int n, i;
   18484              : 
   18485       119421 :   switch (mode)
   18486              :     {
   18487        32291 :     case E_V2SFmode:
   18488        32291 :     case E_V2SImode:
   18489        32291 :       if (!mmx_ok && !TARGET_SSE)
   18490              :         break;
   18491              :       /* FALLTHRU */
   18492              : 
   18493       108968 :     case E_V16SImode:
   18494       108968 :     case E_V16SFmode:
   18495       108968 :     case E_V8DFmode:
   18496       108968 :     case E_V8DImode:
   18497       108968 :     case E_V8SFmode:
   18498       108968 :     case E_V8SImode:
   18499       108968 :     case E_V4DFmode:
   18500       108968 :     case E_V4DImode:
   18501       108968 :     case E_V4SFmode:
   18502       108968 :     case E_V4SImode:
   18503       108968 :     case E_V2DFmode:
   18504       108968 :     case E_V2DImode:
   18505       108968 :       n = GET_MODE_NUNITS (mode);
   18506       346362 :       for (i = 0; i < n; i++)
   18507       237394 :         ops[i] = XVECEXP (vals, 0, i);
   18508       108968 :       ix86_expand_vector_init_concat (mode, target, ops, n);
   18509       220041 :       return;
   18510              : 
   18511              :     case E_V2TImode:  /* Built as V4DImode, then viewed in TARGET's mode.  */
   18512          135 :       for (i = 0; i < 2; i++)
   18513           90 :         ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
   18514           45 :       op0 = gen_reg_rtx (V4DImode);
   18515           45 :       ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
   18516           45 :       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
   18517           45 :       return;
   18518              : 
   18519              :     case E_V4TImode:  /* Two V4DImode halves joined into a V8DImode.  */
   18520          195 :       for (i = 0; i < 4; i++)
   18521          156 :         ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
   18522           39 :       ops[4] = gen_reg_rtx (V4DImode);
   18523           39 :       ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
   18524           39 :       ops[5] = gen_reg_rtx (V4DImode);
   18525           39 :       ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
   18526           39 :       op0 = gen_reg_rtx (V8DImode);
   18527           39 :       ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
   18528           39 :       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
   18529           39 :       return;
   18530              : 
   18531           69 :     case E_V32QImode:
   18532           69 :       half_mode = V16QImode;
   18533           69 :       goto half;
   18534              : 
   18535           64 :     case E_V16HImode:
   18536           64 :       half_mode = V8HImode;
   18537           64 :       goto half;
   18538              : 
   18539          237 :     case E_V16HFmode:
   18540          237 :       half_mode = V8HFmode;
   18541          237 :       goto half;
   18542              : 
   18543           95 :     case E_V16BFmode:
   18544           95 :       half_mode = V8BFmode;
   18545           95 :       goto half;
   18546              : 
   18547          465 : half:  /* Interleave each half separately, then VEC_CONCAT the two.  */
   18548          465 :       n = GET_MODE_NUNITS (mode);
   18549         9009 :       for (i = 0; i < n; i++)
   18550         8544 :         ops[i] = XVECEXP (vals, 0, i);
   18551          465 :       op0 = gen_reg_rtx (half_mode);
   18552          465 :       op1 = gen_reg_rtx (half_mode);
   18553          465 :       ix86_expand_vector_init_interleave (half_mode, op0, ops,
   18554              :                                           n >> 2);
   18555          465 :       ix86_expand_vector_init_interleave (half_mode, op1,
   18556          465 :                                           &ops [n >> 1], n >> 2);
   18557          465 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
   18558          465 :       return;
   18559              : 
   18560           56 :     case E_V64QImode:
   18561           56 :       quarter_mode = V16QImode;
   18562           56 :       half_mode = V32QImode;
   18563           56 :       goto quarter;
   18564              : 
   18565           71 :     case E_V32HImode:
   18566           71 :       quarter_mode = V8HImode;
   18567           71 :       half_mode = V16HImode;
   18568           71 :       goto quarter;
   18569              : 
   18570          287 :     case E_V32HFmode:
   18571          287 :       quarter_mode = V8HFmode;
   18572          287 :       half_mode = V16HFmode;
   18573          287 :       goto quarter;
   18574              : 
   18575           51 :     case E_V32BFmode:
   18576           51 :       quarter_mode = V8BFmode;
   18577           51 :       half_mode = V16BFmode;
   18578           51 :       goto quarter;
   18579              : 
   18580          465 : quarter:  /* Interleave four quarters, then concatenate pairwise twice.  */
   18581          465 :       n = GET_MODE_NUNITS (mode);
   18582        17137 :       for (i = 0; i < n; i++)
   18583        16672 :         ops[i] = XVECEXP (vals, 0, i);
   18584          465 :       op0 = gen_reg_rtx (quarter_mode);
   18585          465 :       op1 = gen_reg_rtx (quarter_mode);
   18586          465 :       op2 = gen_reg_rtx (quarter_mode);
   18587          465 :       op3 = gen_reg_rtx (quarter_mode);
   18588          465 :       op4 = gen_reg_rtx (half_mode);
   18589          465 :       op5 = gen_reg_rtx (half_mode);
   18590          465 :       ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
   18591              :                                           n >> 3);
   18592          465 :       ix86_expand_vector_init_interleave (quarter_mode, op1,
   18593          465 :                                           &ops [n >> 2], n >> 3);
   18594          465 :       ix86_expand_vector_init_interleave (quarter_mode, op2,
   18595          465 :                                           &ops [n >> 1], n >> 3);
   18596          465 :       ix86_expand_vector_init_interleave (quarter_mode, op3,
   18597          465 :                                           &ops [(n >> 1) | (n >> 2)], n >> 3);
   18598          465 :       emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
   18599          465 :       emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
   18600          465 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
   18601          465 :       return;
   18602              : 
   18603          323 :     case E_V16QImode:
   18604          323 :       if (!TARGET_SSE4_1)
   18605              :         break;
   18606              :       /* FALLTHRU */
   18607              : 
   18608          517 :     case E_V8HImode:
   18609          517 :       if (!TARGET_SSE2)
   18610              :         break;
   18611              : 
   18612              :       /* Don't use ix86_expand_vector_init_interleave if we can't
   18613              :          move from GPR to SSE register directly.  */
   18614          517 :       if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
   18615              :         break;
   18616              :       /* FALLTHRU */
   18617              : 
   18618         1091 :     case E_V8HFmode:
   18619         1091 :     case E_V8BFmode:
   18620              : 
   18621         1091 :       n = GET_MODE_NUNITS (mode);
   18622         9915 :       for (i = 0; i < n; i++)
   18623         8824 :         ops[i] = XVECEXP (vals, 0, i);
   18624         1091 :       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
   18625         1091 :       return;
   18626              : 
   18627              :     case E_V4HFmode:
   18628              :     case E_V4BFmode:
   18629              :     case E_V2HFmode:
   18630              :     case E_V2BFmode:
   18631         8348 :       int_inner_mode = HImode;  /* View HF/BF elements as HImode below.  */
   18632              :       break;
   18633              : 
   18634              :     case E_V4HImode:
   18635              :     case E_V8QImode:
   18636              : 
   18637              :     case E_V2HImode:
   18638              :     case E_V4QImode:
   18639              :       break;
   18640              : 
   18641            0 :     default:
   18642            0 :       gcc_unreachable ();
   18643              :     }
   18644              : 
   18645         8348 :     {  /* Fallback: pack the elements into integer words with shift and IOR.  */
   18646         8348 :       int i, j, n_elts, n_words, n_elt_per_word;
   18647         8348 :       machine_mode tmp_mode, inner_mode;
   18648         8348 :       rtx words[4], shift;
   18649              : 
   18650        16773 :       tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
   18651              : 
   18652         8348 :       inner_mode = GET_MODE_INNER (mode);
   18653         8348 :       n_elts = GET_MODE_NUNITS (mode);
   18654        16696 :       n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
   18655         8348 :       n_elt_per_word = n_elts / n_words;
   18656         8348 :       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
   18657              : 
   18658        17085 :       for (i = 0; i < n_words; ++i)
   18659              :         {
   18660              :           rtx word = NULL_RTX;
   18661              : 
   18662        46289 :           for (j = 0; j < n_elt_per_word; ++j)  /* Highest index first.  */
   18663              :             {
   18664        37552 :               rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
   18665        37552 :               if (int_inner_mode != E_VOIDmode)
   18666              :                 {
   18667          138 :                   gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
   18668          138 :                   rtx tmp = gen_reg_rtx (int_inner_mode);
   18669          138 :                   elt = lowpart_subreg (int_inner_mode,
   18670              :                                         force_reg (inner_mode, elt),
   18671              :                                         inner_mode);
   18672          138 :                   emit_move_insn (tmp, elt);
   18673          138 :                   elt = tmp;
   18674              :                 }
   18675        37552 :               elt = convert_modes (tmp_mode, inner_mode, elt, true);
   18676              : 
   18677        37552 :               if (j == 0)
   18678              :                 word = elt;
   18679              :               else
   18680              :                 {
   18681        28815 :                   word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
   18682              :                                               NULL_RTX, 1, OPTAB_LIB_WIDEN);
   18683        28815 :                   word = expand_simple_binop (tmp_mode, IOR, word, elt,
   18684              :                                               NULL_RTX, 1, OPTAB_LIB_WIDEN);
   18685              :                 }
   18686              :             }
   18687              : 
   18688         8737 :           words[i] = word;
   18689              :         }
   18690              : 
   18691         8348 :       if (n_words == 1)
   18692         7959 :         emit_move_insn (target, gen_lowpart (mode, words[0]));
   18693          389 :       else if (n_words == 2)
   18694              :         {
   18695          389 :           gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
   18696          389 :           machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
   18697          389 :           rtx tmp = gen_reg_rtx (concat_mode);
   18698          389 :           vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
   18699          389 :           ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
   18700          389 :           emit_move_insn (target, gen_lowpart (mode, tmp));
   18701              :         }
   18702            0 :       else if (n_words == 4)
   18703              :         {
   18704            0 :           rtx tmp = gen_reg_rtx (V4SImode);
   18705            0 :           gcc_assert (tmp_mode == SImode);
   18706            0 :           vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
   18707            0 :           ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
   18708            0 :           emit_move_insn (target, gen_lowpart (mode, tmp));
   18709              :         }
   18710              :       else
   18711            0 :         gcc_unreachable ();
   18712              :     }
   18713              : }
   18714              : 
   18715              : /* Initialize vector TARGET via VALS.  Suppress the use of MMX
   18716              :    instructions unless MMX_OK is true.
                      :    The vector mode is GET_MODE (TARGET); the initializers are read
                      :    with XVECEXP (VALS, 0, i).  Two shapes of VALS are handled:
                      :    - XVECLEN (VALS, 0) differs from the number of units of the mode:
                      :      the elements are themselves vectors, and exactly two vectors
                      :      with half as many units each are concatenated into TARGET.
                      :    - Otherwise the elements are scalars.  Strategies are tried in
                      :      this order: all-zero vector, broadcast of a single value,
                      :      all-constant vector, exactly one non-constant element, and
                      :      finally ix86_expand_vector_init_general.  */
   18717              : 
   18718              : void
   18719       130475 : ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
   18720              : {
   18721       130475 :   machine_mode mode = GET_MODE (target);
   18722       130475 :   machine_mode inner_mode = GET_MODE_INNER (mode);
   18723       130475 :   int n_elts = GET_MODE_NUNITS (mode);
   18724       130475 :   int n_var = 0, one_var = -1;
   18725       130475 :   bool all_same = true, all_const_zero = true;
   18726       130475 :   int i;
   18727       130475 :   rtx x;
   18728              : 
   18729              :   /* Handle first initialization from vector elts.  When the element
                      :      count of VALS does not match the units of MODE, the operands
                      :      are themselves vectors with the same element mode (asserted
                      :      below).  Only two operands with half as many units each are
                      :      supported; any other shape is unreachable.  For QImode, HImode,
                      :      TImode, HFmode and BFmode elements the operands are
                      :      reinterpreted with gen_lowpart as vectors of SImode (DImode for
                      :      TImode) elements and concatenated into a fresh pseudo, which is
                      :      then moved into TARGET through gen_lowpart.  In that path MODE
                      :      and INNER_MODE are overwritten: INNER_MODE becomes the mode of
                      :      each half, and N_BITS, despite its name, ends up holding the
                      :      number of ELT_MODE elements of the full vector that is handed
                      :      to mode_for_vector.  */
   18730       130475 :   if (n_elts != XVECLEN (vals, 0))
   18731              :     {
   18732         1104 :       rtx subtarget = target;
   18733         1104 :       x = XVECEXP (vals, 0, 0);
   18734         2208 :       gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
   18735         2208 :       if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
   18736              :         {
   18737         1104 :           rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
   18738         1104 :           if (inner_mode == QImode
   18739         1104 :               || inner_mode == HImode
   18740         1104 :               || inner_mode == TImode
   18741              :               || inner_mode == HFmode
   18742              :               || inner_mode == BFmode)
   18743              :             {
   18744          146 :               unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
   18745          146 :               scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
   18746          146 :               n_bits /= GET_MODE_SIZE (elt_mode);
   18747          146 :               mode = mode_for_vector (elt_mode, n_bits).require ();
   18748          146 :               inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
   18749          146 :               ops[0] = gen_lowpart (inner_mode, ops[0]);
   18750          146 :               ops[1] = gen_lowpart (inner_mode, ops[1]);
   18751          146 :               subtarget = gen_reg_rtx (mode);
   18752              :             }
   18753         1104 :           ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
   18754         1104 :           if (subtarget != target)
   18755          146 :             emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
   18756         1104 :           return;
   18757              :         }
   18758            0 :       gcc_unreachable ();
   18759              :     }
   18760              : 
   18761       475237 :   for (i = 0; i < n_elts; ++i)
   18762              :     {
   18763       345866 :       x = XVECEXP (vals, 0, i);
   18764       671502 :       if (!(CONST_SCALAR_INT_P (x)
   18765       329614 :             || CONST_DOUBLE_P (x)
   18766              :             || CONST_FIXED_P (x)))
   18767       325636 :         n_var++, one_var = i;
   18768        20230 :       else if (x != CONST0_RTX (inner_mode))
   18769         3260 :         all_const_zero = false;
   18770       345866 :       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
   18771              :         all_same = false;
   18772              :     }
   18773              : 
   18774              :   /* Handle the zero vector as special case.  The loop above counted
                      :      in N_VAR the elements that are not CONST_SCALAR_INT_P,
                      :      CONST_DOUBLE_P or CONST_FIXED_P and left the index of the last
                      :      such element in ONE_VAR.  ALL_CONST_ZERO is cleared only by a
                      :      constant element other than CONST0_RTX (inner_mode), so it says
                      :      nothing about the non-constant elements.  ALL_SAME is cleared
                      :      when any element is not rtx_equal_p to element 0.  */
   18775       129371 :   if (n_var == 0 && all_const_zero)
   18776              :     {
   18777          302 :       emit_move_insn (target, CONST0_RTX (mode));
   18778          302 :       return;
   18779              :     }
   18780              : 
   18781              :   /* If all values are identical, broadcast the value.  If
                      :      ix86_expand_vector_init_duplicate returns false, keep trying
                      :      the strategies below.  */
   18782       129069 :   if (all_same
   18783       136275 :       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
   18784         7206 :                                             XVECEXP (vals, 0, 0)))
   18785              :     return;
   18786              : 
   18787              :   /* Constants are best loaded from the constant pool.  With no
                      :      non-constant element, emit a move of the CONST_VECTOR built
                      :      from the elements of VALS.  */
   18788       122957 :   if (n_var == 0)
   18789              :     {
   18790           41 :       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
   18791           41 :       return;
   18792              :     }
   18793              : 
   18794              :   /* Values where only one field is non-constant are best loaded from
   18795              :      the pool and overwritten via move later.  If every constant field
                      :      is zero, first try ix86_expand_vector_init_one_nonzero; if that
                      :      does not apply or fails, try ix86_expand_vector_init_one_var.
                      :      When neither succeeds, fall through to the general expansion
                      :      at the end of the function.  */
   18796       122916 :   if (n_var == 1)
   18797              :     {
   18798        11482 :       if (all_const_zero
   18799        21809 :           && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
   18800        10327 :                                                   XVECEXP (vals, 0, one_var),
   18801              :                                                   one_var))
   18802              :         return;
   18803              : 
   18804         7838 :       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
   18805              :         return;
   18806              :     }
   18807              : 
   18808       119032 :   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
   18809              : }
   18810              : 
   18811              : /* Set the element of vector TARGET selected by the rtx IDX to VAL and
                      :    leave the other elements unchanged.  Implemented as
   18812              :    V setg (V v, int idx, T val)
   18813              :    {
   18814              :      V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
   18815              :      V valv = (V){val, val, val, val, val, val, val, val};
   18816              :      V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
   18817              :      v = (v & ~mask) | (valv & mask);
   18818              :      return v;
   18819              :    }.
                      :    VAL and the index are broadcast with
                      :    ix86_expand_vector_init_duplicate and the select is emitted by
                      :    ix86_expand_int_vcond; each step is asserted to succeed.  */
   18820              : void
   18821          129 : ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
   18822              : {
   18823          129 :   rtx vec[64];
   18824          129 :   machine_mode mode = GET_MODE (target);
   18825          129 :   machine_mode cmp_mode = mode;
   18826          129 :   int n_elts = GET_MODE_NUNITS (mode);
   18827          129 :   rtx valv,idxv,constv,idx_tmp;
   18828          129 :   bool ok = false;
   18829              : 
   18830              :   /* Broadcast and comparison of 512-bit vectors with byte or word
   18831              :      elements are only available under TARGET_AVX512BW.  Without it,
   18832              :      split the 512-bit vector into two 256-bit halves, update each half
                      :      with a recursive call and concatenate the results back into
                      :      TARGET.  The low half is indexed by IDX itself and the high
                      :      half by IDX_HI, which the MINUS expansion below computes from
                      :      IDX and n_elts / 2 (presumably IDX - n_elts / 2 with operand 0
                      :      as destination).  Presumably an index that falls outside a half
                      :      matches none of its lanes, so only one half actually changes
                      :      (NOTE(review): confirm).  */
   18833          129 :   if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
   18834          123 :        || mode == V64QImode)
   18835           10 :       && !TARGET_AVX512BW)
   18836              :     {
   18837            3 :       gcc_assert (TARGET_AVX512F);
   18838            3 :       rtx vhi, vlo, idx_hi;
   18839            3 :       machine_mode half_mode;
   18840            3 :       rtx (*extract_hi)(rtx, rtx);
   18841            3 :       rtx (*extract_lo)(rtx, rtx);
   18842              : 
   18843            3 :       if (mode == V32HImode)
   18844              :         {
   18845              :           half_mode = V16HImode;
   18846              :           extract_hi = gen_vec_extract_hi_v32hi;
   18847              :           extract_lo = gen_vec_extract_lo_v32hi;
   18848              :         }
   18849              :       else if (mode == V32HFmode)
   18850              :         {
   18851              :           half_mode = V16HFmode;
   18852              :           extract_hi = gen_vec_extract_hi_v32hf;
   18853              :           extract_lo = gen_vec_extract_lo_v32hf;
   18854              :         }
   18855              :       else if (mode == V32BFmode)
   18856              :         {
   18857              :           half_mode = V16BFmode;
   18858              :           extract_hi = gen_vec_extract_hi_v32bf;
   18859              :           extract_lo = gen_vec_extract_lo_v32bf;
   18860              :         }
   18861              :       else
   18862              :         {
   18863            3 :           half_mode = V32QImode;
   18864            3 :           extract_hi = gen_vec_extract_hi_v64qi;
   18865            3 :           extract_lo = gen_vec_extract_lo_v64qi;
   18866              :         }
   18867              : 
   18868            3 :       vhi = gen_reg_rtx (half_mode);
   18869            3 :       vlo = gen_reg_rtx (half_mode);
   18870            3 :       idx_hi = gen_reg_rtx (GET_MODE (idx));
   18871            3 :       emit_insn (extract_hi (vhi, target));
   18872            3 :       emit_insn (extract_lo (vlo, target));
   18873            3 :       vec[0] = idx_hi;
   18874            3 :       vec[1] = idx;
   18875            3 :       vec[2] = GEN_INT (n_elts/2);
   18876            3 :       ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
   18877            3 :       ix86_expand_vector_set_var (vhi, val, idx_hi);
   18878            3 :       ix86_expand_vector_set_var (vlo, val, idx);
   18879            3 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
   18880            3 :       return;
   18881              :     }
   18882              :   /* The lane numbers are compared as integers, so for floating-point
                      :      element modes pick as CMP_MODE the integer vector mode with the
                      :      same number of elements and the same element width (for example
                      :      V4SF -> V4SI, V8HF and V8BF -> V8HI).  Integer vector modes keep
                      :      CMP_MODE equal to MODE.  A floating-point mode missing from the
                      :      switch is unreachable.  */
   18883          504 :   if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
   18884              :     {
   18885           42 :       switch (mode)
   18886              :         {
   18887              :         case E_V2DFmode:
   18888              :           cmp_mode = V2DImode;
   18889              :           break;
   18890            6 :         case E_V4DFmode:
   18891            6 :           cmp_mode = V4DImode;
   18892            6 :           break;
   18893            4 :         case E_V8DFmode:
   18894            4 :           cmp_mode = V8DImode;
   18895            4 :           break;
   18896            2 :         case E_V2SFmode:
   18897            2 :           cmp_mode = V2SImode;
   18898            2 :           break;
   18899            6 :         case E_V4SFmode:
   18900            6 :           cmp_mode = V4SImode;
   18901            6 :           break;
   18902            6 :         case E_V8SFmode:
   18903            6 :           cmp_mode = V8SImode;
   18904            6 :           break;
   18905            5 :         case E_V16SFmode:
   18906            5 :           cmp_mode = V16SImode;
   18907            5 :           break;
   18908            1 :         case E_V2HFmode:
   18909            1 :         case E_V2BFmode:
   18910            1 :           cmp_mode = V2HImode;
   18911            1 :           break;
   18912            1 :         case E_V4HFmode:
   18913            1 :         case E_V4BFmode:
   18914            1 :           cmp_mode = V4HImode;
   18915            1 :           break;
   18916              :         case E_V8HFmode:
   18917            2 :           cmp_mode = V8HImode;
   18918              :           break;
   18919              :         case E_V16HFmode:
   18920            2 :           cmp_mode = V16HImode;
   18921              :           break;
   18922              :         case E_V32HFmode:
   18923            1 :           cmp_mode = V32HImode;
   18924              :           break;
   18925              :         case E_V8BFmode:
   18926            2 :           cmp_mode = V8HImode;
   18927              :           break;
   18928              :         case E_V16BFmode:
   18929            2 :           cmp_mode = V16HImode;
   18930              :           break;
   18931              :         case E_V32BFmode:
   18932            1 :           cmp_mode = V32HImode;
   18933              :           break;
   18934            0 :         default:
   18935            0 :           gcc_unreachable ();
   18936              :         }
   18937              :     }
   18938              :   /* Build in VEC the lane numbers 0 .. n_elts - 1 and wrap them into a
                      :      CONST_VECTOR of CMP_MODE.  VALV and IDXV are fresh pseudos for
                      :      the broadcast value and index; IDX is first converted to the
                      :      element mode of CMP_MODE (the final argument 1 presumably
                      :      requests an unsigned conversion).  */
   18939         1604 :   for (int i = 0; i != n_elts; i++)
   18940         1478 :     vec[i] = GEN_INT (i);
   18941          126 :   constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
   18942          126 :   valv = gen_reg_rtx (mode);
   18943          126 :   idxv = gen_reg_rtx (cmp_mode);
   18944          252 :   idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
   18945              :   /* Broadcast VAL into VALV and the converted index into IDXV, then
                      :      reuse VEC as the operand array for ix86_expand_int_vcond.
                      :      Judging by the pseudo-code in the function comment the
                      :      operands are: destination, value where the comparison holds,
                      :      value elsewhere, the comparison rtx and its two operands, i.e.
                      :      target = (idxv == constv) ? valv : target  (NOTE(review):
                      :      confirm the operand order against ix86_expand_int_vcond).  */
   18946          126 :   ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
   18947              :                                           mode, valv, val);
   18948          126 :   gcc_assert (ok);
   18949          126 :   ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
   18950              :                                           cmp_mode, idxv, idx_tmp);
   18951          126 :   gcc_assert (ok);
   18952          126 :   vec[0] = target;
   18953          126 :   vec[1] = valv;
   18954          126 :   vec[2] = target;
   18955          126 :   vec[3] = gen_rtx_EQ (mode, idxv, constv);
   18956          126 :   vec[4] = idxv;
   18957          126 :   vec[5] = constv;
   18958          126 :   ok = ix86_expand_int_vcond (vec);
   18959          126 :   gcc_assert (ok);
   18960              : }
   18961              : 
   18962              : void
   18963         8340 : ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
   18964              : {
   18965         8340 :   machine_mode mode = GET_MODE (target);
   18966         8340 :   machine_mode inner_mode = GET_MODE_INNER (mode);
   18967         8340 :   machine_mode half_mode;
   18968         8340 :   bool use_vec_merge = false;
   18969         8340 :   bool blendm_const = false;
   18970         8340 :   rtx tmp;
   18971         8340 :   static rtx (*gen_extract[8][2]) (rtx, rtx)
   18972              :     = {
   18973              :         { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
   18974              :         { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
   18975              :         { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
   18976              :         { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
   18977              :         { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
   18978              :         { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
   18979              :         { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
   18980              :         { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
   18981              :       };
   18982         8340 :   static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
   18983              :     = {
   18984              :         { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
   18985              :         { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
   18986              :         { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
   18987              :         { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
   18988              :         { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
   18989              :         { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
   18990              :         { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
   18991              :         { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
   18992              :       };
   18993         8340 :   int i, j, n;
   18994         8340 :   machine_mode mmode = VOIDmode;
   18995         8340 :   rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
   18996              : 
   18997         8340 :   switch (mode)
   18998              :     {
   18999          188 :     case E_V2SImode:
   19000          188 :       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   19001              :       if (use_vec_merge)
   19002              :         break;
   19003              :       /* FALLTHRU */
   19004              : 
   19005          167 :     case E_V2SFmode:
   19006          167 :       if (mmx_ok)
   19007              :         {
   19008          165 :           tmp = gen_reg_rtx (GET_MODE_INNER (mode));
   19009          165 :           ix86_expand_vector_extract (true, tmp, target, 1 - elt);
   19010          165 :           if (elt == 0)
   19011            0 :             tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
   19012              :           else
   19013          165 :             tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
   19014          165 :           emit_insn (gen_rtx_SET (target, tmp));
   19015          165 :           return;
   19016              :         }
   19017              :       break;
   19018              : 
   19019          220 :     case E_V2DImode:
   19020          220 :       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
   19021           74 :       if (use_vec_merge)
   19022              :         break;
   19023              : 
   19024           74 :       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
   19025           74 :       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
   19026           74 :       if (elt == 0)
   19027           49 :         tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
   19028              :       else
   19029           25 :         tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
   19030           74 :       emit_insn (gen_rtx_SET (target, tmp));
   19031           74 :       return;
   19032              : 
   19033          130 :     case E_V2DFmode:
   19034              :       /* NB: For ELT == 0, use standard scalar operation patterns which
   19035              :          preserve the rest of the vector for combiner:
   19036              : 
   19037              :          (vec_merge:V2DF
   19038              :            (vec_duplicate:V2DF (reg:DF))
   19039              :            (reg:V2DF)
   19040              :            (const_int 1))
   19041              :        */
   19042          130 :       if (elt == 0)
   19043           68 :         goto do_vec_merge;
   19044              : 
   19045           62 :       {
   19046           62 :         rtx op0, op1;
   19047              : 
   19048              :         /* For the two element vectors, we implement a VEC_CONCAT with
   19049              :            the extraction of the other element.  */
   19050              : 
   19051           62 :         tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
   19052           62 :         tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
   19053              : 
   19054           62 :         if (elt == 0)
   19055              :           op0 = val, op1 = tmp;
   19056              :         else
   19057           62 :           op0 = tmp, op1 = val;
   19058              : 
   19059           62 :         tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
   19060           62 :         emit_insn (gen_rtx_SET (target, tmp));
   19061              :       }
   19062           62 :       return;
   19063              : 
   19064          574 :     case E_V4SFmode:
   19065          574 :       use_vec_merge = TARGET_SSE4_1;
   19066          574 :       if (use_vec_merge)
   19067              :         break;
   19068              : 
   19069           62 :       switch (elt)
   19070              :         {
   19071              :         case 0:
   19072              :           use_vec_merge = true;
   19073              :           break;
   19074              : 
   19075            1 :         case 1:
   19076              :           /* tmp = target = A B C D */
   19077            1 :           tmp = copy_to_reg (target);
   19078              :           /* target = A A B B */
   19079            1 :           emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
   19080              :           /* target = X A B B */
   19081            1 :           ix86_expand_vector_set (false, target, val, 0);
   19082              :           /* target = A X C D  */
   19083            1 :           emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
   19084              :                                           const1_rtx, const0_rtx,
   19085              :                                           GEN_INT (2+4), GEN_INT (3+4)));
   19086            1 :           return;
   19087              : 
   19088            0 :         case 2:
   19089              :           /* tmp = target = A B C D */
   19090            0 :           tmp = copy_to_reg (target);
   19091              :           /* tmp = X B C D */
   19092            0 :           ix86_expand_vector_set (false, tmp, val, 0);
   19093              :           /* target = A B X D */
   19094            0 :           emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
   19095              :                                           const0_rtx, const1_rtx,
   19096              :                                           GEN_INT (0+4), GEN_INT (3+4)));
   19097            0 :           return;
   19098              : 
   19099            4 :         case 3:
   19100              :           /* tmp = target = A B C D */
   19101            4 :           tmp = copy_to_reg (target);
   19102              :           /* tmp = X B C D */
   19103            4 :           ix86_expand_vector_set (false, tmp, val, 0);
   19104              :           /* target = A B C X */
   19105            4 :           emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
   19106              :                                           const0_rtx, const1_rtx,
   19107              :                                           GEN_INT (2+4), GEN_INT (0+4)));
   19108            4 :           return;
   19109              : 
   19110            0 :         default:
   19111            0 :           gcc_unreachable ();
   19112              :         }
   19113              :       break;
   19114              : 
   19115          437 :     case E_V4SImode:
   19116          437 :       use_vec_merge = TARGET_SSE4_1;
   19117          437 :       if (use_vec_merge)
   19118              :         break;
   19119              : 
   19120              :       /* Element 0 handled by vec_merge below.  */
   19121          277 :       if (elt == 0)
   19122              :         {
   19123              :           use_vec_merge = true;
   19124              :           break;
   19125              :         }
   19126              : 
   19127           87 :       if (TARGET_SSE2)
   19128              :         {
   19129              :           /* With SSE2, use integer shuffles to swap element 0 and ELT,
   19130              :              store into element 0, then shuffle them back.  */
   19131              : 
   19132           87 :           rtx order[4];
   19133              : 
   19134           87 :           order[0] = GEN_INT (elt);
   19135           87 :           order[1] = const1_rtx;
   19136           87 :           order[2] = const2_rtx;
   19137           87 :           order[3] = GEN_INT (3);
   19138           87 :           order[elt] = const0_rtx;
   19139              : 
   19140           87 :           emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
   19141              :                                         order[1], order[2], order[3]));
   19142              : 
   19143           87 :           ix86_expand_vector_set (false, target, val, 0);
   19144              : 
   19145           87 :           emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
   19146              :                                         order[1], order[2], order[3]));
   19147              :         }
   19148              :       else
   19149              :         {
   19150              :           /* For SSE1, we have to reuse the V4SF code.  */
   19151            0 :           rtx t = gen_reg_rtx (V4SFmode);
   19152            0 :           emit_move_insn (t, gen_lowpart (V4SFmode, target));
   19153            0 :           ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
   19154            0 :           emit_move_insn (target, gen_lowpart (mode, t));
   19155              :         }
   19156              :       return;
   19157              : 
   19158         3534 :     case E_V8HImode:
   19159         3534 :     case E_V8HFmode:
   19160         3534 :     case E_V8BFmode:
   19161         3534 :     case E_V2HImode:
   19162         3534 :     case E_V2HFmode:
   19163         3534 :     case E_V2BFmode:
   19164         3534 :       use_vec_merge = TARGET_SSE2;
   19165         3534 :       break;
   19166           50 :     case E_V4HImode:
   19167           50 :     case E_V4HFmode:
   19168           50 :     case E_V4BFmode:
   19169           50 :       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
   19170              :       break;
   19171              : 
   19172         3067 :     case E_V16QImode:
   19173         3067 :     case E_V4QImode:
   19174         3067 :       use_vec_merge = TARGET_SSE4_1;
   19175         3067 :       break;
   19176              : 
   19177            1 :     case E_V8QImode:
   19178            1 :       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   19179              :       break;
   19180              : 
   19181            3 :     case E_V32QImode:
   19182            3 :       half_mode = V16QImode;
   19183            3 :       j = 0;
   19184            3 :       n = 16;
   19185            3 :       goto half;
   19186              : 
   19187           17 :     case E_V16HFmode:
   19188           17 :     case E_V16BFmode:
   19189              :       /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
   19190           17 :       if (TARGET_AVX2 && elt != 0)
   19191              :         {
   19192           12 :           mmode = SImode;
   19193           12 :           gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
   19194              :                                                 : gen_avx2_pblendbf_1);
   19195              :           blendm_const = true;
   19196              :           break;
   19197              :         }
   19198              :       else
   19199              :         {
   19200            5 :           half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
   19201            3 :           j = ((mode == E_V16HFmode) ? 6 : 7);
   19202            5 :           n = 8;
   19203            5 :           goto half;
   19204              :         }
   19205              : 
   19206            5 :     case E_V16HImode:
   19207            5 :       half_mode = V8HImode;
   19208            5 :       j = 1;
   19209            5 :       n = 8;
   19210            5 :       goto half;
   19211              : 
   19212           15 :     case E_V8SImode:
   19213           15 :       half_mode = V4SImode;
   19214           15 :       j = 2;
   19215           15 :       n = 4;
   19216           15 :       goto half;
   19217              : 
   19218           15 :     case E_V4DImode:
   19219           15 :       half_mode = V2DImode;
   19220           15 :       j = 3;
   19221           15 :       n = 2;
   19222           15 :       goto half;
   19223              : 
   19224            4 :     case E_V8SFmode:
   19225            4 :       half_mode = V4SFmode;
   19226            4 :       j = 4;
   19227            4 :       n = 4;
   19228            4 :       goto half;
   19229              : 
   19230            6 :     case E_V4DFmode:
   19231            6 :       half_mode = V2DFmode;
   19232            6 :       j = 5;
   19233            6 :       n = 2;
   19234            6 :       goto half;
   19235              : 
   19236           53 : half:
   19237              :       /* Compute offset.  */
   19238           53 :       i = elt / n;
   19239           53 :       elt %= n;
   19240              : 
   19241           53 :       gcc_assert (i <= 1);
   19242              : 
   19243              :       /* Extract the half.  */
   19244           53 :       tmp = gen_reg_rtx (half_mode);
   19245           53 :       emit_insn (gen_extract[j][i] (tmp, target));
   19246              : 
   19247              :       /* Put val in tmp at elt.  */
   19248           53 :       ix86_expand_vector_set (false, tmp, val, elt);
   19249              : 
   19250              :       /* Put it back.  */
   19251           53 :       emit_insn (gen_insert[j][i] (target, target, tmp));
   19252           53 :       return;
   19253              : 
   19254            8 :     case E_V8DFmode:
   19255            8 :       if (TARGET_AVX512F)
   19256              :         {
   19257              :           mmode = QImode;
   19258              :           gen_blendm = gen_avx512f_blendmv8df;
   19259              :         }
   19260              :       break;
   19261              : 
   19262            6 :     case E_V8DImode:
   19263            6 :       if (TARGET_AVX512F)
   19264              :         {
   19265              :           mmode = QImode;
   19266              :           gen_blendm = gen_avx512f_blendmv8di;
   19267              :         }
   19268              :       break;
   19269              : 
   19270            0 :     case E_V16SFmode:
   19271            0 :       if (TARGET_AVX512F)
   19272              :         {
   19273              :           mmode = HImode;
   19274              :           gen_blendm = gen_avx512f_blendmv16sf;
   19275              :         }
   19276              :       break;
   19277              : 
   19278            0 :     case E_V16SImode:
   19279            0 :       if (TARGET_AVX512F)
   19280              :         {
   19281              :           mmode = HImode;
   19282              :           gen_blendm = gen_avx512f_blendmv16si;
   19283              :         }
   19284              :       break;
   19285              : 
   19286           12 :     case E_V32HFmode:
   19287           12 :       if (TARGET_AVX512BW)
   19288              :         {
   19289              :           mmode = SImode;
   19290              :           gen_blendm = gen_avx512bw_blendmv32hf;
   19291              :         }
   19292              :       break;
   19293           12 :     case E_V32BFmode:
   19294           12 :       if (TARGET_AVX512BW)
   19295              :         {
   19296              :           mmode = SImode;
   19297              :           gen_blendm = gen_avx512bw_blendmv32bf;
   19298              :         }
   19299              :       break;
   19300           11 :     case E_V32HImode:
   19301           11 :       if (TARGET_AVX512BW)
   19302              :         {
   19303              :           mmode = SImode;
   19304              :           gen_blendm = gen_avx512bw_blendmv32hi;
   19305              :         }
   19306            7 :       else if (TARGET_AVX512F)
   19307              :         {
   19308            7 :           half_mode = E_V8HImode;
   19309            7 :           n = 8;
   19310            7 :           goto quarter;
   19311              :         }
   19312              :       break;
   19313              : 
   19314           12 :     case E_V64QImode:
   19315           12 :       if (TARGET_AVX512BW)
   19316              :         {
   19317              :           mmode = DImode;
   19318              :           gen_blendm = gen_avx512bw_blendmv64qi;
   19319              :         }
   19320            6 :       else if (TARGET_AVX512F)
   19321              :         {
   19322            6 :           half_mode = E_V16QImode;
   19323            6 :           n = 16;
   19324            6 :           goto quarter;
   19325              :         }
   19326              :       break;
   19327              : 
   19328           13 : quarter:
   19329              :       /* Compute offset.  */
   19330           13 :       i = elt / n;
   19331           13 :       elt %= n;
   19332              : 
   19333           13 :       gcc_assert (i <= 3);
   19334              : 
   19335           13 :       {
   19336              :         /* Extract the quarter.  */
   19337           13 :         tmp = gen_reg_rtx (V4SImode);
   19338           13 :         rtx tmp2 = gen_lowpart (V16SImode, target);
   19339           13 :         rtx mask = gen_reg_rtx (QImode);
   19340              : 
   19341           13 :         emit_move_insn (mask, constm1_rtx);
   19342           13 :         emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
   19343              :                                                    tmp, mask));
   19344              : 
   19345           13 :         tmp2 = gen_reg_rtx (half_mode);
   19346           13 :         emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
   19347           13 :         tmp = tmp2;
   19348              : 
   19349              :         /* Put val in tmp at elt.  */
   19350           13 :         ix86_expand_vector_set (false, tmp, val, elt);
   19351              : 
   19352              :         /* Put it back.  */
   19353           13 :         tmp2 = gen_reg_rtx (V16SImode);
   19354           13 :         rtx tmp3 = gen_lowpart (V16SImode, target);
   19355           13 :         mask = gen_reg_rtx (HImode);
   19356           13 :         emit_move_insn (mask, constm1_rtx);
   19357           13 :         tmp = gen_lowpart (V4SImode, tmp);
   19358           13 :         emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
   19359              :                                                   tmp3, mask));
   19360           13 :         emit_move_insn (target, gen_lowpart (mode, tmp2));
   19361              :       }
   19362           13 :       return;
   19363              : 
   19364              :     default:
   19365              :       break;
   19366              :     }
   19367              : 
   19368         6601 :   if (mmode != VOIDmode)
   19369              :     {
   19370           54 :       tmp = gen_reg_rtx (mode);
   19371           54 :       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
   19372           54 :       rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
   19373              :       /* The avx512*_blendm<mode> expanders have different operand order
   19374              :          from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
   19375              :          elements where the mask is set and second input operand otherwise,
   19376              :          in {sse,avx}*_*blend* the first input operand is used for elements
   19377              :          where the mask is clear and second input operand otherwise.  */
   19378           54 :       if (!blendm_const)
   19379           42 :         merge_mask = force_reg (mmode, merge_mask);
   19380           54 :       emit_insn (gen_blendm (target, target, tmp, merge_mask));
   19381              :     }
   19382         7759 :   else if (use_vec_merge)
   19383              :     {
   19384         7747 : do_vec_merge:
   19385         7815 :       if (!nonimmediate_operand (val, inner_mode))
   19386            1 :         val = force_reg (inner_mode, val);
   19387         7815 :       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
   19388         7815 :       tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
   19389              :                                GEN_INT (HOST_WIDE_INT_1U << elt));
   19390         7815 :       emit_insn (gen_rtx_SET (target, tmp));
   19391              :     }
   19392              :   else
   19393              :     {
   19394           24 :       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   19395              : 
   19396           12 :       emit_move_insn (mem, target);
   19397              : 
   19398           24 :       tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
   19399           12 :       emit_move_insn (tmp, val);
   19400              : 
   19401           12 :       emit_move_insn (target, mem);
   19402              :     }
   19403              : }
   19404              : 
                        /* Emit insns that store element ELT of vector VEC in TARGET.

                           The strategy depends on the mode of VEC and the enabled ISA:
                           - modes for which a direct element select is usable set
                             use_vec_extr and end in a VEC_SELECT of ELT;
                           - V4SF/V4SI without SSE4.1 first shuffle the wanted element
                             into position 0 and then select element 0;
                           - wider modes extract the low or high half containing ELT and
                             recurse on that half;
                           - everything else copies VEC to a stack temporary and loads
                             the element from memory.

                           MMX_OK is only consulted for the V2SI/V2SF and V4HI/V4HF/V4BF
                           modes; the recursive calls always pass false.  */
   19405              : void
   19406       108433 : ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
   19407              : {
   19408       108433 :   machine_mode mode = GET_MODE (vec);
   19409       108433 :   machine_mode inner_mode = GET_MODE_INNER (mode);
   19410       108433 :   bool use_vec_extr = false;
   19411       108433 :   rtx tmp;
   19412              : 
   19413       108433 :   switch (mode)
   19414              :     {
                            /* V2SI is selected directly when both TARGET_MMX_WITH_SSE
                               and TARGET_SSE4_1 hold; otherwise it is handled like V2SF.  */
   19415         8478 :     case E_V2SImode:
   19416         8478 :       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   19417              :       if (use_vec_extr)
   19418              :         break;
   19419              :       /* FALLTHRU */
   19420              : 
   19421         9356 :     case E_V2SFmode:
                              /* Without MMX_OK, use_vec_extr stays false here, so the
                                 stack-temporary fallback at the end is used.  */
   19422         9356 :       if (!mmx_ok)
   19423              :         break;
   19424              :       /* FALLTHRU */
   19425              : 
   19426              :     case E_V2DFmode:
   19427              :     case E_V2DImode:
   19428              :     case E_V2TImode:
   19429              :     case E_V4TImode:
   19430              :       use_vec_extr = true;
   19431              :       break;
   19432              : 
   19433         7904 :     case E_V4SFmode:
   19434         7904 :       use_vec_extr = TARGET_SSE4_1;
   19435         7904 :       if (use_vec_extr)
   19436              :         break;
   19437              : 
                              /* No SSE4.1: bring the requested element into position 0
                                 of a temporary, then select element 0 of that.  */
   19438         4035 :       switch (elt)
   19439              :         {
   19440              :         case 0:
   19441              :           tmp = vec;
   19442              :           break;
   19443              : 
   19444         1675 :         case 1:
   19445         1675 :         case 3:
   19446         1675 :           tmp = gen_reg_rtx (mode);
   19447         1675 :           emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
   19448              :                                        GEN_INT (elt), GEN_INT (elt),
   19449         1675 :                                        GEN_INT (elt+4), GEN_INT (elt+4)));
   19450         1675 :           break;
   19451              : 
   19452          931 :         case 2:
   19453          931 :           tmp = gen_reg_rtx (mode);
   19454          931 :           emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
   19455          931 :           break;
   19456              : 
   19457            0 :         default:
   19458            0 :           gcc_unreachable ();
   19459              :         }
   19460              :       vec = tmp;
   19461              :       use_vec_extr = true;
   19462              :       elt = 0;
   19463              :       break;
   19464              : 
   19465        23564 :     case E_V4SImode:
   19466        23564 :       use_vec_extr = TARGET_SSE4_1;
   19467        23564 :       if (use_vec_extr)
   19468              :         break;
   19469              : 
   19470        17883 :       if (TARGET_SSE2)
   19471              :         {
                                  /* Same scheme as the V4SF case above, using the
                                     integer shuffle/interleave patterns.  */
   19472        17879 :           switch (elt)
   19473              :             {
   19474              :             case 0:
   19475              :               tmp = vec;
   19476              :               break;
   19477              : 
   19478         5848 :             case 1:
   19479         5848 :             case 3:
   19480         5848 :               tmp = gen_reg_rtx (mode);
   19481         5848 :               emit_insn (gen_sse2_pshufd_1 (tmp, vec,
   19482              :                                             GEN_INT (elt), GEN_INT (elt),
   19483              :                                             GEN_INT (elt), GEN_INT (elt)));
   19484         5848 :               break;
   19485              : 
   19486         2906 :             case 2:
   19487         2906 :               tmp = gen_reg_rtx (mode);
   19488         2906 :               emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
   19489         2906 :               break;
   19490              : 
   19491            0 :             default:
   19492            0 :               gcc_unreachable ();
   19493              :             }
   19494              :           vec = tmp;
   19495              :           use_vec_extr = true;
   19496              :           elt = 0;
   19497              :         }
   19498              :       else
   19499              :         {
   19500              :           /* For SSE1, we have to reuse the V4SF code.  */
   19501            4 :           ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
   19502            4 :                                       gen_lowpart (V4SFmode, vec), elt);
   19503            4 :           return;
   19504              :         }
   19505              :       break;
   19506              : 
   19507         6488 :     case E_V8HImode:
   19508         6488 :     case E_V8HFmode:
   19509         6488 :     case E_V8BFmode:
   19510         6488 :     case E_V2HImode:
   19511         6488 :     case E_V2HFmode:
   19512         6488 :     case E_V2BFmode:
   19513         6488 :       use_vec_extr = TARGET_SSE2;
   19514         6488 :       break;
   19515          858 :     case E_V4HImode:
   19516          858 :     case E_V4HFmode:
   19517          858 :     case E_V4BFmode:
   19518          858 :       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
   19519              :       break;
   19520              : 
   19521         7809 :     case E_V16QImode:
   19522         7809 :       use_vec_extr = TARGET_SSE4_1;
                              /* Element 0 without SSE4.1 (but with SSE2): extract element 0
                                 of the V4SI view of VEC into an SImode register and copy
                                 its QImode low part to TARGET.  Only done when optimizing
                                 for size or when TARGET_INTER_UNIT_MOVES_FROM_VEC is set.  */
   19523         7809 :       if (!use_vec_extr
   19524         6223 :           && TARGET_SSE2
   19525         6223 :           && elt == 0
   19526        11683 :           && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
   19527              :         {
   19528         3873 :           tmp = gen_reg_rtx (SImode);
   19529         3873 :           ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
   19530              :                                       0);
   19531         3873 :           emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
   19532         3873 :           return;
   19533              :         }
   19534              :       break;
   19535           78 :     case E_V4QImode:
   19536           78 :       use_vec_extr = TARGET_SSE4_1;
   19537           78 :       break;
   19538              : 
                            /* The cases from here down to V16HF/V16BF all work the same
                               way: extract the low or high half of VEC that contains ELT
                               into a temporary of the half-sized mode, then recurse on that
                               temporary with ELT masked down to an index within the half.
                               Where the case is guarded by a target test that fails, the
                               break leaves use_vec_extr false and the stack fallback is
                               used.  */
   19539          663 :     case E_V8SFmode:
   19540          663 :       if (TARGET_AVX)
   19541              :         {
   19542          663 :           tmp = gen_reg_rtx (V4SFmode);
   19543          663 :           if (elt < 4)
   19544          326 :             emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
   19545              :           else
   19546          337 :             emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
   19547          663 :           ix86_expand_vector_extract (false, target, tmp, elt & 3);
   19548          663 :           return;
   19549              :         }
   19550              :       break;
   19551              : 
   19552          578 :     case E_V4DFmode:
   19553          578 :       if (TARGET_AVX)
   19554              :         {
   19555          578 :           tmp = gen_reg_rtx (V2DFmode);
   19556          578 :           if (elt < 2)
   19557          303 :             emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
   19558              :           else
   19559          275 :             emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
   19560          578 :           ix86_expand_vector_extract (false, target, tmp, elt & 1);
   19561          578 :           return;
   19562              :         }
   19563              :       break;
   19564              : 
   19565          253 :     case E_V32QImode:
   19566          253 :       if (TARGET_AVX)
   19567              :         {
   19568          253 :           tmp = gen_reg_rtx (V16QImode);
   19569          253 :           if (elt < 16)
   19570          130 :             emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
   19571              :           else
   19572          123 :             emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
   19573          253 :           ix86_expand_vector_extract (false, target, tmp, elt & 15);
   19574          253 :           return;
   19575              :         }
   19576              :       break;
   19577              : 
   19578          616 :     case E_V16HImode:
   19579          616 :       if (TARGET_AVX)
   19580              :         {
   19581          616 :           tmp = gen_reg_rtx (V8HImode);
   19582          616 :           if (elt < 8)
   19583          304 :             emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
   19584              :           else
   19585          312 :             emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
   19586          616 :           ix86_expand_vector_extract (false, target, tmp, elt & 7);
   19587          616 :           return;
   19588              :         }
   19589              :       break;
   19590              : 
   19591          993 :     case E_V8SImode:
   19592          993 :       if (TARGET_AVX)
   19593              :         {
   19594          993 :           tmp = gen_reg_rtx (V4SImode);
   19595          993 :           if (elt < 4)
   19596          479 :             emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
   19597              :           else
   19598          514 :             emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
   19599          993 :           ix86_expand_vector_extract (false, target, tmp, elt & 3);
   19600          993 :           return;
   19601              :         }
   19602              :       break;
   19603              : 
   19604         1518 :     case E_V4DImode:
   19605         1518 :       if (TARGET_AVX)
   19606              :         {
   19607         1518 :           tmp = gen_reg_rtx (V2DImode);
   19608         1518 :           if (elt < 2)
   19609          813 :             emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
   19610              :           else
   19611          705 :             emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
   19612         1518 :           ix86_expand_vector_extract (false, target, tmp, elt & 1);
   19613         1518 :           return;
   19614              :         }
   19615              :       break;
   19616              : 
   19617            8 :     case E_V32HImode:
   19618            8 :       if (TARGET_AVX512BW)
   19619              :         {
   19620            8 :           tmp = gen_reg_rtx (V16HImode);
   19621            8 :           if (elt < 16)
   19622            3 :             emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
   19623              :           else
   19624            5 :             emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
   19625            8 :           ix86_expand_vector_extract (false, target, tmp, elt & 15);
   19626            8 :           return;
   19627              :         }
   19628              :       break;
   19629              : 
   19630           10 :     case E_V64QImode:
   19631           10 :       if (TARGET_AVX512BW)
   19632              :         {
   19633           10 :           tmp = gen_reg_rtx (V32QImode);
   19634           10 :           if (elt < 32)
   19635            5 :             emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
   19636              :           else
   19637            5 :             emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
   19638           10 :           ix86_expand_vector_extract (false, target, tmp, elt & 31);
   19639           10 :           return;
   19640              :         }
   19641              :       break;
   19642              : 
                            /* The next four cases split unconditionally; no target test
                               is made here.  */
   19643          311 :     case E_V16SFmode:
   19644          311 :       tmp = gen_reg_rtx (V8SFmode);
   19645          311 :       if (elt < 8)
   19646          157 :         emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
   19647              :       else
   19648          154 :         emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
   19649          311 :       ix86_expand_vector_extract (false, target, tmp, elt & 7);
   19650          311 :       return;
   19651              : 
   19652          296 :     case E_V8DFmode:
   19653          296 :       tmp = gen_reg_rtx (V4DFmode);
   19654          296 :       if (elt < 4)
   19655          160 :         emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
   19656              :       else
   19657          136 :         emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
   19658          296 :       ix86_expand_vector_extract (false, target, tmp, elt & 3);
   19659          296 :       return;
   19660              : 
   19661          252 :     case E_V16SImode:
   19662          252 :       tmp = gen_reg_rtx (V8SImode);
   19663          252 :       if (elt < 8)
   19664          133 :         emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
   19665              :       else
   19666          119 :         emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
   19667          252 :       ix86_expand_vector_extract (false, target, tmp, elt & 7);
   19668          252 :       return;
   19669              : 
   19670          706 :     case E_V8DImode:
   19671          706 :       tmp = gen_reg_rtx (V4DImode);
   19672          706 :       if (elt < 4)
   19673          403 :         emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
   19674              :       else
   19675          303 :         emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
   19676          706 :       ix86_expand_vector_extract (false, target, tmp, elt & 3);
   19677          706 :       return;
   19678              : 
   19679           45 :     case E_V32HFmode:
   19680           45 :     case E_V32BFmode:
   19681           45 :       if (TARGET_AVX512BW)
   19682              :         {
                                  /* The half mode keeps the element type of VEC (HF or BF);
                                     the mode-parameterized extract generators are used.  */
   19683           45 :           tmp = (mode == E_V32HFmode
   19684           45 :                  ? gen_reg_rtx (V16HFmode)
   19685            7 :                  : gen_reg_rtx (V16BFmode));
   19686           45 :           if (elt < 16)
   19687           31 :             emit_insn (gen_vec_extract_lo (mode, tmp, vec));
   19688              :           else
   19689           14 :             emit_insn (gen_vec_extract_hi (mode, tmp, vec));
   19690           45 :           ix86_expand_vector_extract (false, target, tmp, elt & 15);
   19691           45 :           return;
   19692              :         }
   19693              :       break;
   19694              : 
   19695          474 :     case E_V16HFmode:
   19696          474 :     case E_V16BFmode:
   19697          474 :       if (TARGET_AVX)
   19698              :         {
   19699          474 :           tmp = (mode == E_V16HFmode
   19700          474 :                  ? gen_reg_rtx (V8HFmode)
   19701          339 :                  : gen_reg_rtx (V8BFmode));
   19702          474 :           if (elt < 8)
   19703          249 :             emit_insn (gen_vec_extract_lo (mode, tmp, vec));
   19704              :           else
   19705          225 :             emit_insn (gen_vec_extract_hi (mode, tmp, vec));
   19706          474 :           ix86_expand_vector_extract (false, target, tmp, elt & 7);
   19707          474 :           return;
   19708              :         }
   19709              :       break;
   19710              : 
   19711          630 :     case E_V8QImode:
   19712          630 :       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   19713              :       /* ??? Could extract the appropriate HImode element and shift.  */
   19714              :       break;
   19715              : 
   19716              :     default:
   19717              :       break;
   19718              :     }
   19719              : 
   19720        26397 :   if (use_vec_extr)
   19721              :     {
                              /* Build (vec_select:INNER_MODE VEC (parallel [ELT])).  */
   19722        90289 :       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
   19723        90289 :       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
   19724              : 
   19725              :       /* Let the rtl optimizers know about the zero extension performed.  */
   19726        90289 :       if (inner_mode == QImode || inner_mode == HImode)
   19727              :         {
                                  /* The select is emitted as a zero extension into a fresh
                                     SImode register; TARGET then receives the INNER_MODE
                                     low part of that register, flagged as a promoted
                                     subreg.  */
   19728         8484 :           rtx reg = gen_reg_rtx (SImode);
   19729         8484 :           tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
   19730         8484 :           emit_move_insn (reg, tmp);
   19731         8484 :           tmp = gen_lowpart (inner_mode, reg);
   19732         8484 :           SUBREG_PROMOTED_VAR_P (tmp) = 1;
   19733         8484 :           SUBREG_PROMOTED_SET (tmp, 1);
   19734              :         }
   19735              : 
   19736        90289 :       emit_move_insn (target, tmp);
   19737              :     }
   19738              :   else
   19739              :     {
                              /* Fallback: store the whole vector to a stack temporary and
                                 load the element from its byte offset within that slot.  */
   19740        15088 :       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   19741              : 
   19742         7544 :       emit_move_insn (mem, vec);
   19743              : 
   19744        15088 :       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
   19745         7544 :       emit_move_insn (target, tmp);
   19746              :     }
   19747              : }
   19748              : 
   19749              : /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   19750              :    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   19751              :    The upper bits of DEST are undefined, though they shouldn't cause
   19752              :    exceptions (some bits from src or all zeros are ok).

                           The instruction is chosen from the mode of SRC and the value of
                           I.  When the chosen pattern operates in a different mode, the
                           result is produced in a fresh register of that mode and then
                           copied into DEST through gen_lowpart.  An unhandled mode hits
                           gcc_unreachable.  */
   19753              : 
   19754              : static void
   19755        41449 : emit_reduc_half (rtx dest, rtx src, int i)
   19756              : {
                          /* TEM is the insn to emit; D is the register that insn writes,
                             which stays equal to DEST unless a case below replaces it.  */
   19757        41449 :   rtx tem, d = dest;
   19758        41449 :   switch (GET_MODE (src))
   19759              :     {
   19760         5970 :     case E_V4SFmode:
   19761         5970 :       if (i == 128)
   19762         2985 :         tem = gen_sse_movhlps (dest, src, src);
   19763              :       else
   19764         2985 :         tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
   19765              :                                    GEN_INT (1 + 4), GEN_INT (1 + 4));
   19766              :       break;
   19767         3333 :     case E_V2DFmode:
   19768         3333 :       tem = gen_vec_interleave_highv2df (dest, src, src);
   19769         3333 :       break;
                            /* For the integer modes below, SRC is viewed as a vector of one
                               or more wide integer elements (V1SI, V1DI, V1TI, ...) and the
                               lshr pattern is used with a shift count of i / 2.  */
   19770           76 :     case E_V4QImode:
   19771           76 :       d = gen_reg_rtx (V1SImode);
   19772           76 :       tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
   19773           76 :                                GEN_INT (i / 2));
   19774           76 :       break;
   19775          600 :     case E_V8QImode:
   19776          600 :     case E_V4HImode:
   19777          600 :       d = gen_reg_rtx (V1DImode);
   19778          600 :       tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
   19779          600 :                                GEN_INT (i / 2));
   19780          600 :       break;
   19781        31470 :     case E_V16QImode:
   19782        31470 :     case E_V8HImode:
   19783        31470 :     case E_V8HFmode:
   19784        31470 :     case E_V4SImode:
   19785        31470 :     case E_V2DImode:
                              /* When the tuning flag is set, the 128-, 64- and 32-bit steps
                                 use pshufd/pshuflw and leave the switch via break.  Any
                                 other value of I falls out of this if and uses the V1TI
                                 shift below.  */
   19786        31470 :       if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
   19787              :         {
   19788           15 :           if (i == 128)
   19789              :             {
   19790            9 :               d = gen_reg_rtx (V4SImode);
   19791           18 :               tem = gen_sse2_pshufd_1 (
   19792            9 :                   d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
   19793              :                   GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
   19794            9 :               break;
   19795              :             }
   19796            6 :           else if (i == 64)
   19797              :             {
   19798            5 :               d = gen_reg_rtx (V4SImode);
   19799           10 :               tem = gen_sse2_pshufd_1 (
   19800            5 :                   d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
   19801              :                   GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
   19802            5 :               break;
   19803              :             }
   19804            1 :           else if (i == 32)
   19805              :             {
   19806            1 :               d = gen_reg_rtx (V8HImode);
   19807            2 :               tem = gen_sse2_pshuflw_1 (
   19808            1 :                   d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
   19809              :                   GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
   19810            1 :               break;
   19811              :             }
   19812              :         }
   19813        31455 :       d = gen_reg_rtx (V1TImode);
   19814        31455 :       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
   19815        31455 :                                 GEN_INT (i / 2));
   19816        31455 :       break;
   19817            0 :     case E_V8SFmode:
   19818            0 :       if (i == 256)
   19819            0 :         tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
   19820              :       else
   19821            0 :         tem = gen_avx_shufps256 (dest, src, src,
   19822              :                                  GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
   19823              :       break;
   19824            0 :     case E_V4DFmode:
   19825            0 :       if (i == 256)
   19826            0 :         tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
   19827              :       else
   19828            0 :         tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
   19829              :       break;
   19830            0 :     case E_V32QImode:
   19831            0 :     case E_V16HImode:
   19832            0 :     case E_V16HFmode:
   19833            0 :     case E_V8SImode:
   19834            0 :     case E_V4DImode:
   19835            0 :       if (i == 256)
   19836              :         {
                                  /* The permute pattern is used in V4DImode; a temporary is
                                     only needed when DEST has some other mode.  */
   19837            0 :           if (GET_MODE (dest) != V4DImode)
   19838            0 :             d = gen_reg_rtx (V4DImode);
   19839            0 :           tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
   19840            0 :                                    gen_lowpart (V4DImode, src),
   19841              :                                    const1_rtx);
   19842              :         }
   19843              :       else
   19844              :         {
   19845            0 :           d = gen_reg_rtx (V2TImode);
   19846            0 :           tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
   19847            0 :                                     GEN_INT (i / 2));
   19848              :         }
   19849              :       break;
   19850            0 :     case E_V64QImode:
   19851            0 :     case E_V32HImode:
   19852            0 :     case E_V32HFmode:
                              /* Steps with I below 64 use the V4TI shift; larger steps
                                 share the V16SI shuffles with the modes that follow.  */
   19853            0 :       if (i < 64)
   19854              :         {
   19855            0 :           d = gen_reg_rtx (V4TImode);
   19856            0 :           tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
   19857            0 :                                         GEN_INT (i / 2));
   19858            0 :           break;
   19859              :         }
   19860              :       /* FALLTHRU */
   19861            0 :     case E_V16SImode:
   19862            0 :     case E_V16SFmode:
   19863            0 :     case E_V8DImode:
   19864            0 :     case E_V8DFmode:
                              /* Both patterns below operate on the V16SI views of DEST and
                                 SRC, so D is left equal to DEST and no copy-back happens.
                                 The first four selectors of the I > 128 form depend on
                                 whether I is 512.  */
   19865            0 :       if (i > 128)
   19866            0 :         tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
   19867            0 :                                         gen_lowpart (V16SImode, src),
   19868            0 :                                         gen_lowpart (V16SImode, src),
   19869              :                                         GEN_INT (0x4 + (i == 512 ? 4 : 0)),
   19870              :                                         GEN_INT (0x5 + (i == 512 ? 4 : 0)),
   19871              :                                         GEN_INT (0x6 + (i == 512 ? 4 : 0)),
   19872              :                                         GEN_INT (0x7 + (i == 512 ? 4 : 0)),
   19873              :                                         GEN_INT (0xC), GEN_INT (0xD),
   19874              :                                         GEN_INT (0xE), GEN_INT (0xF),
   19875              :                                         GEN_INT (0x10), GEN_INT (0x11),
   19876              :                                         GEN_INT (0x12), GEN_INT (0x13),
   19877              :                                         GEN_INT (0x14), GEN_INT (0x15),
   19878              :                                         GEN_INT (0x16), GEN_INT (0x17));
   19879              :       else
   19880            0 :         tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
   19881            0 :                                     gen_lowpart (V16SImode, src),
   19882              :                                     GEN_INT (i == 128 ? 0x2 : 0x1),
   19883              :                                     GEN_INT (0x3),
   19884              :                                     GEN_INT (0x3),
   19885              :                                     GEN_INT (0x3),
   19886              :                                     GEN_INT (i == 128 ? 0x6 : 0x5),
   19887              :                                     GEN_INT (0x7),
   19888              :                                     GEN_INT (0x7),
   19889              :                                     GEN_INT (0x7),
   19890              :                                     GEN_INT (i == 128 ? 0xA : 0x9),
   19891              :                                     GEN_INT (0xB),
   19892              :                                     GEN_INT (0xB),
   19893              :                                     GEN_INT (0xB),
   19894              :                                     GEN_INT (i == 128 ? 0xE : 0xD),
   19895              :                                     GEN_INT (0xF),
   19896              :                                     GEN_INT (0xF),
   19897              :                                     GEN_INT (0xF));
   19898              :       break;
   19899            0 :     default:
   19900            0 :       gcc_unreachable ();
   19901              :     }
   19902        41449 :   emit_insn (tem);
                          /* If the insn wrote a temporary, copy it into DEST, reinterpreted
                             in DEST's own mode.  */
   19903        41449 :   if (d != dest)
   19904        32146 :     emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
   19905        41449 : }
   19906              : 
   19907              : /* Expand a vector reduction.  FN is the binary pattern to reduce;
   19908              :    DEST is the destination; IN is the input vector.

                           The reduction is emitted as a sequence of halving steps: each
                           step uses emit_reduc_half to bring the upper half of the bits
                           still being reduced down to the bottom of a fresh register and
                           combines that register with the current vector through FN.  The
                           last step writes its result into DEST; earlier steps write to
                           new pseudo registers.  */
   19909              : 
   19910              : void
   19911        20546 : ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
   19912              : {
   19913        20546 :   rtx half, dst, vec = in;
   19914        20546 :   machine_mode mode = GET_MODE (in);
   19915        20546 :   int i;
   19916              : 
   19917              :   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
   19918        20546 :   if (TARGET_SSE4_1
   19919         9878 :       && mode == V8HImode
   19920          780 :       && fn == gen_uminv8hi3)
   19921              :     {
   19922            4 :       emit_insn (gen_sse4_1_phminposuw (dest, in));
   19923            4 :       return;
   19924              :     }
   19925              : 
                          /* I is the number of bits of VEC still to be reduced.  It starts
                             at the full vector width and is halved after every step; the
                             loop stops once it is no larger than one element.  */
   19926        41084 :   for (i = GET_MODE_BITSIZE (mode);
   19927       123982 :        i > GET_MODE_UNIT_BITSIZE (mode);
   19928        41449 :        i >>= 1)
   19929              :     {
   19930        41449 :       half = gen_reg_rtx (mode);
   19931        41449 :       emit_reduc_half (half, vec, i);
                              /* When I is twice the element width this is the final step,
                                 so its result goes straight into DEST.  */
   19932        82898 :       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
   19933              :         dst = dest;
   19934              :       else
   19935        20907 :         dst = gen_reg_rtx (mode);
   19936        41449 :       emit_insn (fn (dst, half, vec));
                              /* The next step reduces the result of this one.  */
   19937        41449 :       vec = dst;
   19938              :     }
   19939              : }
   19940              : 
    19941              : /* Output code to perform a conditional jump to LABEL, if C2 flag in
    19942              :    FP status register is set.
                       :
                       :    The status word is first stored into a HImode pseudo with fnstsw.
                       :    When SAHF is available and either preferred or we optimize for
                       :    size, the pseudo is moved into the flags with sahf and the jump
                       :    tests UNORDERED on a CCmode flags register.  Otherwise the pseudo
                       :    is tested against mask 0x04 with testqi_ext_1_ccno and the jump
                       :    tests NE on a CCNOmode flags register.  */
    19943              : 
    19944              : void
    19945          284 : ix86_emit_fp_unordered_jump (rtx label)
    19946              : {
    19947          284 :   rtx reg = gen_reg_rtx (HImode);
    19948          284 :   rtx_insn *insn;
    19949          284 :   rtx temp;
    19950              : 
    19951          284 :   emit_insn (gen_x86_fnstsw_1 (reg));
    19952              : 
    19953          284 :   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    19954              :     {
    19955           37 :       emit_insn (gen_x86_sahf_1 (reg));
    19956              : 
    19957           37 :       temp = gen_rtx_REG (CCmode, FLAGS_REG);
    19958           37 :       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    19959              :     }
    19960              :   else
    19961              :     {
    19962          247 :       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
    19963              : 
    19964          247 :       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
    19965          247 :       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    19966              :     }
    19967              : 
    19968          284 :   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
    19969              :                               gen_rtx_LABEL_REF (VOIDmode, label),
    19970              :                               pc_rtx);
    19971          284 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
    19972          284 :   predict_jump (REG_BR_PROB_BASE * 10 / 100);
    19973          284 :   JUMP_LABEL (insn) = label;
    19974          284 : }
   19975              : 
    19976              : /* Output code to perform a sinh XFmode calculation.
                       :
                       :    OP0 = sinh (OP1), evaluated as
                       :      sign (OP1) * 0.5 * (e1 / (e1 + 1.0) + e1), e1 = expm1 (|OP1|).
                       :    The sign of OP1 is captured with fxam before the absolute value is
                       :    taken and is re-applied through a conditional negation.  */
    19977              : 
    19978              : void
    19979            2 : ix86_emit_i387_sinh (rtx op0, rtx op1)
    19980              : {
    19981            2 :   rtx e1 = gen_reg_rtx (XFmode);
    19982            2 :   rtx e2 = gen_reg_rtx (XFmode);
    19983            2 :   rtx scratch = gen_reg_rtx (HImode);
    19984            2 :   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
    19985            2 :   rtx half = const_double_from_real_value (dconsthalf, XFmode);
    19986            2 :   rtx cst1, tmp;
    19987            2 :   rtx_code_label *jump_label = gen_label_rtx ();
    19988            2 :   rtx_insn *insn;
    19989              : 
    19990              :   /* scratch = fxam (op1) */
    19991            2 :   emit_insn (gen_fxamxf2_i387 (scratch, op1));
    19992              : 
    19993              :   /* e1 = expm1 (|op1|) */
    19994            2 :   emit_insn (gen_absxf2 (e2, op1));
    19995            2 :   emit_insn (gen_expm1xf2 (e1, e2));
    19996              : 
    19997              :   /* e2 = e1 / (e1 + 1.0) + e1 */
    19998            2 :   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
    19999            2 :   emit_insn (gen_addxf3 (e2, e1, cst1));
    20000            2 :   emit_insn (gen_divxf3 (e2, e1, e2));
    20001            2 :   emit_insn (gen_addxf3 (e2, e2, e1));
    20002              : 
    20003              :   /* flags = signbit (op1) */
    20004            2 :   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
    20005              : 
    20006              :   /* if (flags) then e2 = -e2; the jump skips the negation when the
                       :      tested bit is clear.  */
    20007            2 :   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
    20008              :                               gen_rtx_EQ (VOIDmode, flags, const0_rtx),
    20009              :                               gen_rtx_LABEL_REF (VOIDmode, jump_label),
    20010              :                               pc_rtx);
    20011            2 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    20012            2 :   predict_jump (REG_BR_PROB_BASE * 50 / 100);
    20013            2 :   JUMP_LABEL (insn) = jump_label;
    20014              : 
    20015            2 :   emit_insn (gen_negxf2 (e2, e2));
    20016              : 
    20017            2 :   emit_label (jump_label);
    20018            2 :   LABEL_NUSES (jump_label) = 1;
    20019              : 
    20020              :   /* op0 = 0.5 * e2 */
    20021            2 :   half = force_reg (XFmode, half);
    20022            2 :   emit_insn (gen_mulxf3 (op0, e2, half));
    20023            2 : }
   20024              : 
    20025              : /* Output code to perform a cosh XFmode calculation.
                       :
                       :    OP0 = cosh (OP1), evaluated as 0.5 * (e1 + 1.0 / e1) with
                       :    e1 = exp (OP1).  */
    20026              : 
    20027              : void
    20028            3 : ix86_emit_i387_cosh (rtx op0, rtx op1)
    20029              : {
    20030            3 :   rtx e1 = gen_reg_rtx (XFmode);
    20031            3 :   rtx e2 = gen_reg_rtx (XFmode);
    20032            3 :   rtx half = const_double_from_real_value (dconsthalf, XFmode);
    20033            3 :   rtx cst1;
    20034              : 
    20035              :   /* e1 = exp (op1) */
    20036            3 :   emit_insn (gen_expxf2 (e1, op1));
    20037              : 
    20038              :   /* e2 = e1 + 1.0 / e1 */
    20039            3 :   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
    20040            3 :   emit_insn (gen_divxf3 (e2, cst1, e1));
    20041            3 :   emit_insn (gen_addxf3 (e2, e1, e2));
    20042              : 
    20043              :   /* op0 = 0.5 * e2 */
    20044            3 :   half = force_reg (XFmode, half);
    20045            3 :   emit_insn (gen_mulxf3 (op0, e2, half));
    20046            3 : }
   20047              : 
    20048              : /* Output code to perform a tanh XFmode calculation.
                       :
                       :    With e1 = expm1 (-|2 * OP1|), the quotient e1 / (e1 + 2.0) equals
                       :    -tanh (|OP1|).  It is therefore negated when the sign bit of OP1
                       :    is clear and left as is when the sign bit is set.  */
    20049              : 
    20050              : void
    20051            1 : ix86_emit_i387_tanh (rtx op0, rtx op1)
    20052              : {
    20053            1 :   rtx e1 = gen_reg_rtx (XFmode);
    20054            1 :   rtx e2 = gen_reg_rtx (XFmode);
    20055            1 :   rtx scratch = gen_reg_rtx (HImode);
    20056            1 :   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
    20057            1 :   rtx cst2, tmp;
    20058            1 :   rtx_code_label *jump_label = gen_label_rtx ();
    20059            1 :   rtx_insn *insn;
    20060              : 
    20061              :   /* scratch = fxam (op1) */
    20062            1 :   emit_insn (gen_fxamxf2_i387 (scratch, op1));
    20063              : 
    20064              :   /* e1 = expm1 (-|2 * op1|) */
    20065            1 :   emit_insn (gen_addxf3 (e2, op1, op1));
    20066            1 :   emit_insn (gen_absxf2 (e2, e2));
    20067            1 :   emit_insn (gen_negxf2 (e2, e2));
    20068            1 :   emit_insn (gen_expm1xf2 (e1, e2));
    20069              : 
    20070              :   /* e2 = e1 / (e1 + 2.0) */
    20071            1 :   cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
    20072            1 :   emit_insn (gen_addxf3 (e2, e1, cst2));
    20073            1 :   emit_insn (gen_divxf3 (e2, e1, e2));
    20074              : 
    20075              :   /* flags = signbit (op1) */
    20076            1 :   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
    20077              : 
    20078              :   /* if (!flags) then e2 = -e2; the jump skips the negation when the
                       :      tested bit is set.  */
    20079            1 :   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
    20080              :                               gen_rtx_NE (VOIDmode, flags, const0_rtx),
    20081              :                               gen_rtx_LABEL_REF (VOIDmode, jump_label),
    20082              :                               pc_rtx);
    20083            1 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    20084            1 :   predict_jump (REG_BR_PROB_BASE * 50 / 100);
    20085            1 :   JUMP_LABEL (insn) = jump_label;
    20086              : 
    20087            1 :   emit_insn (gen_negxf2 (e2, e2));
    20088              : 
    20089            1 :   emit_label (jump_label);
    20090            1 :   LABEL_NUSES (jump_label) = 1;
    20091              : 
    20092            1 :   emit_move_insn (op0, e2);
    20093            1 : }
   20094              : 
    20095              : /* Output code to perform an asinh XFmode calculation.
                       :
                       :    OP0 = asinh (OP1), evaluated as
                       :      sign (OP1) * log1p (|OP1| + OP1^2 / (sqrt (OP1^2 + 1.0) + 1.0)).
                       :    The log1p step is delegated to ix86_emit_i387_log1p.  */
    20096              : 
    20097              : void
    20098            0 : ix86_emit_i387_asinh (rtx op0, rtx op1)
    20099              : {
    20100            0 :   rtx e1 = gen_reg_rtx (XFmode);
    20101            0 :   rtx e2 = gen_reg_rtx (XFmode);
    20102            0 :   rtx scratch = gen_reg_rtx (HImode);
    20103            0 :   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
    20104            0 :   rtx cst1, tmp;
    20105            0 :   rtx_code_label *jump_label = gen_label_rtx ();
    20106            0 :   rtx_insn *insn;
    20107              : 
    20108              :   /* e1 = op1^2; e2 = sqrt (e1 + 1.0) + 1.0 */
    20109            0 :   emit_insn (gen_mulxf3 (e1, op1, op1));
    20110            0 :   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
    20111            0 :   emit_insn (gen_addxf3 (e2, e1, cst1));
    20112            0 :   emit_insn (gen_sqrtxf2 (e2, e2));
    20113            0 :   emit_insn (gen_addxf3 (e2, e2, cst1));
    20114              : 
    20115              :   /* e1 = e1 / e2 */
    20116            0 :   emit_insn (gen_divxf3 (e1, e1, e2));
    20117              : 
    20118              :   /* scratch = fxam (op1) */
    20119            0 :   emit_insn (gen_fxamxf2_i387 (scratch, op1));
    20120              : 
    20121              :   /* e1 = e1 + |op1| */
    20122            0 :   emit_insn (gen_absxf2 (e2, op1));
    20123            0 :   emit_insn (gen_addxf3 (e1, e1, e2));
    20124              : 
    20125              :   /* e2 = log1p (e1) */
    20126            0 :   ix86_emit_i387_log1p (e2, e1);
    20127              : 
    20128              :   /* flags = signbit (op1) */
    20129            0 :   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
    20130              : 
    20131              :   /* if (flags) then e2 = -e2; the jump skips the negation when the
                       :      tested bit is clear.  */
    20132            0 :   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
    20133              :                               gen_rtx_EQ (VOIDmode, flags, const0_rtx),
    20134              :                               gen_rtx_LABEL_REF (VOIDmode, jump_label),
    20135              :                               pc_rtx);
    20136            0 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    20137            0 :   predict_jump (REG_BR_PROB_BASE * 50 / 100);
    20138            0 :   JUMP_LABEL (insn) = jump_label;
    20139              : 
    20140            0 :   emit_insn (gen_negxf2 (e2, e2));
    20141              : 
    20142            0 :   emit_label (jump_label);
    20143            0 :   LABEL_NUSES (jump_label) = 1;
    20144              : 
    20145            0 :   emit_move_insn (op0, e2);
    20146            0 : }
   20147              : 
    20148              : /* Output code to perform an acosh XFmode calculation.
                       :
                       :    OP0 = acosh (OP1), evaluated as
                       :      log (OP1 + sqrt (OP1 - 1.0) * sqrt (OP1 + 1.0)).  */
    20149              : 
    20150              : void
    20151            0 : ix86_emit_i387_acosh (rtx op0, rtx op1)
    20152              : {
    20153            0 :   rtx e1 = gen_reg_rtx (XFmode);
    20154            0 :   rtx e2 = gen_reg_rtx (XFmode);
    20155            0 :   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
    20156              : 
    20157              :   /* e2 = sqrt (op1 + 1.0) */
    20158            0 :   emit_insn (gen_addxf3 (e2, op1, cst1));
    20159            0 :   emit_insn (gen_sqrtxf2 (e2, e2));
    20160              : 
    20161              :   /* e1 = sqrt (op1 - 1.0) */
    20162            0 :   emit_insn (gen_subxf3 (e1, op1, cst1));
    20163            0 :   emit_insn (gen_sqrtxf2 (e1, e1));
    20164              : 
    20165              :   /* e1 = e1 * e2 */
    20166            0 :   emit_insn (gen_mulxf3 (e1, e1, e2));
    20167              : 
    20168              :   /* e1 = e1 + op1 */
    20169            0 :   emit_insn (gen_addxf3 (e1, e1, op1));
    20170              : 
    20171              :   /* op0 = log (e1) */
    20172            0 :   emit_insn (gen_logxf2 (op0, e1));
    20173            0 : }
   20174              : 
    20175              : /* Output code to perform an atanh XFmode calculation.
                       :
                       :    With e1 = -2 * |OP1| / (|OP1| + 1.0), log1p (e1) equals
                       :    -2 * atanh (|OP1|).  The value is negated when the sign bit of OP1
                       :    is clear and then scaled by 0.5.  The log1p step is delegated to
                       :    ix86_emit_i387_log1p.  */
    20176              : 
    20177              : void
    20178            4 : ix86_emit_i387_atanh (rtx op0, rtx op1)
    20179              : {
    20180            4 :   rtx e1 = gen_reg_rtx (XFmode);
    20181            4 :   rtx e2 = gen_reg_rtx (XFmode);
    20182            4 :   rtx scratch = gen_reg_rtx (HImode);
    20183            4 :   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
    20184            4 :   rtx half = const_double_from_real_value (dconsthalf, XFmode);
    20185            4 :   rtx cst1, tmp;
    20186            4 :   rtx_code_label *jump_label = gen_label_rtx ();
    20187            4 :   rtx_insn *insn;
    20188              : 
    20189              :   /* scratch = fxam (op1) */
    20190            4 :   emit_insn (gen_fxamxf2_i387 (scratch, op1));
    20191              : 
    20192              :   /* e2 = |op1| */
    20193            4 :   emit_insn (gen_absxf2 (e2, op1));
    20194              : 
    20195              :   /* e1 = -(e2 + e2) / (e2 + 1.0) */
    20196            4 :   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
    20197            4 :   emit_insn (gen_addxf3 (e1, e2, cst1));
    20198            4 :   emit_insn (gen_addxf3 (e2, e2, e2));
    20199            4 :   emit_insn (gen_negxf2 (e2, e2));
    20200            4 :   emit_insn (gen_divxf3 (e1, e2, e1));
    20201              : 
    20202              :   /* e2 = log1p (e1) */
    20203            4 :   ix86_emit_i387_log1p (e2, e1);
    20204              : 
    20205              :   /* flags = signbit (op1) */
    20206            4 :   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
    20207              : 
    20208              :   /* if (!flags) then e2 = -e2; the jump skips the negation when the
                       :      tested bit is set.  */
    20209            4 :   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
    20210              :                               gen_rtx_NE (VOIDmode, flags, const0_rtx),
    20211              :                               gen_rtx_LABEL_REF (VOIDmode, jump_label),
    20212              :                               pc_rtx);
    20213            4 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    20214            4 :   predict_jump (REG_BR_PROB_BASE * 50 / 100);
    20215            4 :   JUMP_LABEL (insn) = jump_label;
    20216              : 
    20217            4 :   emit_insn (gen_negxf2 (e2, e2));
    20218              : 
    20219            4 :   emit_label (jump_label);
    20220            4 :   LABEL_NUSES (jump_label) = 1;
    20221              : 
    20222              :   /* op0 = 0.5 * e2 */
    20223            4 :   half = force_reg (XFmode, half);
    20224            4 :   emit_insn (gen_mulxf3 (op0, e2, half));
    20225            4 : }
   20226              : 
    20227              : /* Output code to perform a log1p XFmode calculation.
                       :
                       :    OP0 = log1p (OP1).  |OP1| is compared against the constant
                       :    0.29289... (numerically 1 - sqrt (2) / 2).  Below it, fyl2xp1 is
                       :    applied to OP1 directly; at or above it, 1.0 is added to OP1 and
                       :    fyl2x is used instead.  Both paths use the fldln2 constant as the
                       :    second operand and leave their result in a common pseudo that is
                       :    finally moved to OP0.  */
    20228              : 
    20229              : void
    20230            5 : ix86_emit_i387_log1p (rtx op0, rtx op1)
    20231              : {
    20232            5 :   rtx_code_label *label1 = gen_label_rtx ();
    20233            5 :   rtx_code_label *label2 = gen_label_rtx ();
    20234              : 
    20235            5 :   rtx tmp = gen_reg_rtx (XFmode);
    20236            5 :   rtx res = gen_reg_rtx (XFmode);
    20237            5 :   rtx cst, cstln2, cst1;
    20238            5 :   rtx_insn *insn;
    20239              : 
    20240              :   /* The emit_jump call below emits any pending stack adjustment.  Make
    20241              :      sure it is emitted before the conditional jump; otherwise the stack
    20242              :      adjustment would be only conditional.  */
    20243            5 :   do_pending_stack_adjust ();
    20244              : 
    20245            5 :   cst = const_double_from_real_value
    20246            5 :     (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
    20247            5 :   cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
    20248              : 
    20249            5 :   emit_insn (gen_absxf2 (tmp, op1));
    20250              : 
    20251            5 :   cst = force_reg (XFmode, cst);
    20252            5 :   ix86_expand_branch (GE, tmp, cst, label1);  /* |op1| >= cst: use the fyl2x path.  */
    20253            5 :   predict_jump (REG_BR_PROB_BASE * 10 / 100);
    20254            5 :   insn = get_last_insn ();
    20255            5 :   JUMP_LABEL (insn) = label1;
    20256              : 
    20257            5 :   emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
    20258            5 :   emit_jump (label2);
    20259              : 
    20260            5 :   emit_label (label1);
    20261            5 :   LABEL_NUSES (label1) = 1;
    20262              : 
    20263            5 :   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
    20264            5 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
    20265            5 :   emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
    20266              : 
    20267            5 :   emit_label (label2);
    20268            5 :   LABEL_NUSES (label2) = 1;
    20269              : 
    20270            5 :   emit_move_insn (op0, res);
    20271            5 : }
   20272              : 
    20273              : /* Emit code for round calculation.
                       :
                       :    OP0 = round (OP1), using round(a) = sgn(a) * floor(fabs(a) + 0.5).
                       :    OP1 may be SFmode, DFmode or XFmode; SFmode and DFmode inputs are
                       :    first extended to XFmode.  OP0 may be SFmode, DFmode, XFmode,
                       :    HImode, SImode or DImode; its mode selects the floor and negate
                       :    patterns.  Any other mode hits gcc_unreachable.  */
    20274              : void
    20275           60 : ix86_emit_i387_round (rtx op0, rtx op1)
    20276              : {
    20277           60 :   machine_mode inmode = GET_MODE (op1);
    20278           60 :   machine_mode outmode = GET_MODE (op0);
    20279           60 :   rtx e1 = gen_reg_rtx (XFmode);
    20280           60 :   rtx e2 = gen_reg_rtx (XFmode);
    20281           60 :   rtx scratch = gen_reg_rtx (HImode);
    20282           60 :   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
    20283           60 :   rtx half = const_double_from_real_value (dconsthalf, XFmode);
    20284           60 :   rtx res = gen_reg_rtx (outmode);
    20285           60 :   rtx_code_label *jump_label = gen_label_rtx ();
    20286           60 :   rtx (*floor_insn) (rtx, rtx);
    20287           60 :   rtx (*neg_insn) (rtx, rtx);
    20288           60 :   rtx_insn *insn;
    20289           60 :   rtx tmp;
    20290              : 
    20291           60 :   switch (inmode)  /* Bring the input to XFmode.  */
    20292              :     {
    20293           29 :     case E_SFmode:
    20294           29 :     case E_DFmode:
    20295           29 :       tmp = gen_reg_rtx (XFmode);
    20296              : 
    20297           29 :       emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
    20298           29 :       op1 = tmp;
    20299           29 :       break;
    20300              :     case E_XFmode:
    20301              :       break;
    20302            0 :     default:
    20303            0 :       gcc_unreachable ();
    20304              :     }
    20305              : 
    20306           60 :   switch (outmode)  /* Pick the floor and negate generators.  */
    20307              :     {
    20308              :     case E_SFmode:
    20309              :       floor_insn = gen_frndintxf2_floor;
    20310              :       neg_insn = gen_negsf2;
    20311              :       break;
    20312            6 :     case E_DFmode:
    20313            6 :       floor_insn = gen_frndintxf2_floor;
    20314            6 :       neg_insn = gen_negdf2;
    20315            6 :       break;
    20316           10 :     case E_XFmode:
    20317           10 :       floor_insn = gen_frndintxf2_floor;
    20318           10 :       neg_insn = gen_negxf2;
    20319           10 :       break;
    20320            0 :     case E_HImode:
    20321            0 :       floor_insn = gen_lfloorxfhi2;
    20322            0 :       neg_insn = gen_neghi2;
    20323            0 :       break;
    20324            6 :     case E_SImode:
    20325            6 :       floor_insn = gen_lfloorxfsi2;
    20326            6 :       neg_insn = gen_negsi2;
    20327            6 :       break;
    20328           36 :     case E_DImode:
    20329           36 :       floor_insn = gen_lfloorxfdi2;
    20330           36 :       neg_insn = gen_negdi2;
    20331           36 :       break;
    20332            0 :     default:
    20333            0 :       gcc_unreachable ();
    20334              :     }
    20335              : 
    20336              :   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
    20337              : 
    20338              :   /* scratch = fxam(op1) */
    20339           60 :   emit_insn (gen_fxamxf2_i387 (scratch, op1));
    20340              : 
    20341              :   /* e1 = fabs(op1) */
    20342           60 :   emit_insn (gen_absxf2 (e1, op1));
    20343              : 
    20344              :   /* e2 = e1 + 0.5 */
    20345           60 :   half = force_reg (XFmode, half);
    20346           60 :   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
    20347              : 
    20348              :   /* res = floor(e2).  For SFmode and DFmode results the floor is done in
                       :      XFmode and then brought to OUTMODE with UNSPEC_TRUNC_NOOP; the
                       :      other modes write RES directly.  */
    20349           60 :   switch (outmode)
    20350              :     {
    20351            8 :     case E_SFmode:
    20352            8 :     case E_DFmode:
    20353            8 :       {
    20354            8 :         tmp = gen_reg_rtx (XFmode);
    20355              : 
    20356            8 :         emit_insn (floor_insn (tmp, e2));
    20357            8 :         emit_insn (gen_rtx_SET (res,
    20358              :                                 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
    20359              :                                                 UNSPEC_TRUNC_NOOP)));
    20360              :       }
    20361            8 :       break;
    20362           52 :     default:
    20363           52 :       emit_insn (floor_insn (res, e2));
    20364              :     }
    20365              : 
    20366              :   /* flags = signbit(a) */
    20367           60 :   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
    20368              : 
    20369              :   /* if (flags) then res = -res; the jump skips the negation when the
                       :      tested bit is clear.  */
    20370           60 :   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
    20371              :                               gen_rtx_EQ (VOIDmode, flags, const0_rtx),
    20372              :                               gen_rtx_LABEL_REF (VOIDmode, jump_label),
    20373              :                               pc_rtx);
    20374           60 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    20375           60 :   predict_jump (REG_BR_PROB_BASE * 50 / 100);
    20376           60 :   JUMP_LABEL (insn) = jump_label;
    20377              : 
    20378           60 :   emit_insn (neg_insn (res, res));
    20379              : 
    20380           60 :   emit_label (jump_label);
    20381           60 :   LABEL_NUSES (jump_label) = 1;
    20382              : 
    20383           60 :   emit_move_insn (op0, res);
    20384           60 : }
   20385              : 
    20386              : /* Output code to perform a Newton-Raphson approximation of a single precision
    20387              :    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].
                       :
                       :    RES = A / B in MODE.  A hardware reciprocal estimate of B (UNSPEC_RCP,
                       :    or UNSPEC_RCP14 for V16SFmode and V8DFmode) is refined by one
                       :    Newton-Raphson step.  The step uses two fused multiply-adds when the
                       :    target conditions tested below hold, and separate multiply, add and
                       :    subtract operations otherwise.  */
    20388              : 
    20389              : void
    20390           56 : ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
    20391              : {
    20392           56 :   rtx x0, x1, e0, e1;
    20393              : 
    20394           56 :   x0 = gen_reg_rtx (mode);
    20395           56 :   e0 = gen_reg_rtx (mode);
    20396           56 :   e1 = gen_reg_rtx (mode);
    20397           56 :   x1 = gen_reg_rtx (mode);
    20398              : 
    20399           56 :   b = force_reg (mode, b);
    20400              : 
    20401              :   /* x0 = rcp(b) estimate */
    20402           56 :   if (mode == V16SFmode || mode == V8DFmode)
    20403              :     {
    20404            0 :       emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
    20405              :                                                   UNSPEC_RCP14)));
    20406              :     }
    20407              :   else
    20408           56 :     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
    20409              :                                                 UNSPEC_RCP)));
    20410              : 
    20411           56 :   unsigned vector_size = GET_MODE_SIZE (mode);
    20412              : 
    20413              :   /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
    20414              :      N-R step with 2 fma implementation.  */
    20415           56 :   if (TARGET_FMA
    20416           55 :       || (TARGET_AVX512F && vector_size == 64)
    20417           55 :       || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    20418              :     {
    20419              :       /* e0 = x0 * a  */
    20420            1 :       emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
    20421              :       /* e1 = e0 * b - a  */
    20422            1 :       emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
    20423              :                                                gen_rtx_NEG (mode, a))));
    20424              :       /* res = - e1 * x0 + e0  */
    20425            1 :       emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
    20426              :                                                gen_rtx_NEG (mode, e1),
    20427              :                                                x0, e0)));
    20428              :     }
    20429              :   else
    20430              :     /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
    20431              :     {
    20432              :       /* e0 = x0 * b */
    20433           55 :       emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
    20434              : 
    20435              :       /* e1 = x0 + x0 */
    20436           55 :       emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
    20437              : 
    20438              :       /* e0 = x0 * e0 */
    20439           55 :       emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
    20440              : 
    20441              :       /* x1 = e1 - e0 */
    20442           55 :       emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
    20443              : 
    20444              :       /* res = a * x1 */
    20445           55 :       emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
    20446              :     }
    20447           56 : }
   20448              : 
    20449              : /* Output code to perform a Newton-Raphson approximation of a
    20450              :    single precision floating point [reciprocal] square root.
                       :
                       :    RES = sqrt (A) in MODE or, when RECIP is true, the reciprocal
                       :    square root of A.  A hardware rsqrt estimate (UNSPEC_RSQRT, or
                       :    UNSPEC_RSQRT14 for 64-byte vector modes) is refined by one step of
                       :    the formulas quoted in the body.  */
    20451              : 
    20452              : void
    20453           85 : ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
    20454              : {
    20455           85 :   rtx x0, e0, e1, e2, e3, mthree, mhalf;
    20456           85 :   REAL_VALUE_TYPE r;
    20457           85 :   int unspec;
    20458              : 
    20459           85 :   x0 = gen_reg_rtx (mode);
    20460           85 :   e0 = gen_reg_rtx (mode);
    20461           85 :   e1 = gen_reg_rtx (mode);
    20462           85 :   e2 = gen_reg_rtx (mode);
    20463           85 :   e3 = gen_reg_rtx (mode);
    20464              : 
    20465           85 :   real_from_integer (&r, VOIDmode, -3, SIGNED);  /* mthree = -3.0 */
    20466           85 :   mthree = const_double_from_real_value (r, SFmode);
    20467              : 
    20468           85 :   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);  /* mhalf = -0.5 */
    20469           85 :   mhalf = const_double_from_real_value (r, SFmode);
    20470           85 :   unspec = UNSPEC_RSQRT;
    20471              : 
    20472           85 :   if (VECTOR_MODE_P (mode))
    20473              :     {
    20474           66 :       mthree = ix86_build_const_vector (mode, true, mthree);
    20475           66 :       mhalf = ix86_build_const_vector (mode, true, mhalf);
    20476              :       /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
    20477          132 :       if (GET_MODE_SIZE (mode) == 64)
    20478            0 :         unspec = UNSPEC_RSQRT14;
    20479              :     }
    20480              : 
    20481              :   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
    20482              :      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
    20483              : 
    20484           85 :   a = force_reg (mode, a);
    20485              : 
    20486              :   /* x0 = rsqrt(a) estimate */
    20487           85 :   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
    20488              :                                               unspec)));
    20489              : 
    20490              :   /* For sqrt, zero the estimate wherever a == 0.0, filtering out the
                       :      infinity there to prevent NaN for sqrt(0.0).  */
    20491           85 :   if (!recip)
    20492              :     {
    20493           57 :       rtx zero = force_reg (mode, CONST0_RTX(mode));
    20494           57 :       rtx mask;
    20495              : 
    20496              :       /* Handle masked compare.  */
    20497          110 :       if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
    20498              :         {
    20499            0 :           mask = gen_reg_rtx (HImode);
    20500              :           /* Imm value 0x4 corresponds to not-equal comparison.  */
    20501            0 :           emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
    20502            0 :           emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
    20503              :         }
    20504              :       else
    20505              :         {
    20506           57 :           mask = gen_reg_rtx (mode);
    20507           57 :           emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
    20508           57 :           emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
    20509              :         }
    20510              :     }
    20511              : 
    20512           85 :   mthree = force_reg (mode, mthree);
    20513              : 
    20514              :   /* e0 = x0 * a */
    20515           85 :   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
    20516              : 
    20517           85 :   unsigned vector_size = GET_MODE_SIZE (mode);
    20518           85 :   if (TARGET_FMA
    20519           77 :       || (TARGET_AVX512F && vector_size == 64)
    20520           77 :       || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    20521           16 :     emit_insn (gen_rtx_SET (e2,
    20522              :                             gen_rtx_FMA (mode, e0, x0, mthree)));  /* e2 = e0 * x0 - 3.0 */
    20523              :   else
    20524              :     {
    20525              :       /* e1 = e0 * x0 */
    20526           69 :       emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
    20527              : 
    20528              :       /* e2 = e1 - 3.0 (by adding the constant -3.0) */
    20529           69 :       emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    20530              :     }
    20531              : 
    20532           85 :   mhalf = force_reg (mode, mhalf);
    20533           85 :   if (recip)
    20534              :     /* e3 = -.5 * x0 */
    20535           28 :     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
    20536              :   else
    20537              :     /* e3 = -.5 * e0 */
    20538           57 :     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
    20539              :   /* res = e2 * e3 */
    20540           85 :   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
    20541           85 : }
   20542              : 
   20543              : /* Expand fabs (OP0): emit XA = OP0 & MASK into a new pseudo and return XA.
   20544              :    MASK is the mask for masking out the sign-bit; it is also stored in
   20545              :    *SMASK, if that is non-null.  */
   20546              : 
   20547              : static rtx
   20548         1048 : ix86_expand_sse_fabs (rtx op0, rtx *smask)
   20549              : {
   20550         1048 :   machine_mode vmode, mode = GET_MODE (op0);
   20551         1048 :   rtx xa, mask;
   20552              :   /* NOTE(review): no HFmode -> V8HFmode case below; confirm intended.  */
   20553         1048 :   xa = gen_reg_rtx (mode);
   20554         1048 :   if (mode == SFmode)
   20555              :     vmode = V4SFmode;
   20556          466 :   else if (mode == DFmode)
   20557              :     vmode = V2DFmode;
   20558              :   else
   20559            0 :     vmode = mode;
   20560         1048 :   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
   20561         1048 :   if (!VECTOR_MODE_P (mode))
   20562              :     {
   20563              :       /* Scalar MODE: select element 0 of the vector mask into a MODE pseudo.  */
   20564         1048 :       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
   20565         1048 :       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
   20566         1048 :       mask = gen_reg_rtx (mode);
   20567         1048 :       emit_insn (gen_rtx_SET (mask, tmp));
   20568              :     }
   20569         1048 :   emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
   20570              :   /* Hand the mask to callers that want to reuse it.  */
   20571         1048 :   if (smask)
   20572          995 :     *smask = mask;
   20573              : 
   20574         1048 :   return xa;
   20575              : }
   20576              : 
   20577              : /* Expand a comparison of OP0 with OP1 using comparison code CODE,
   20578              :    swapping the operands first if SWAP_OPERANDS is true.  Emits a compare
   20579              :    setting FLAGS_REG and a jump to a newly created label, taken in case the
   20580              :    comparison is true.  The label is returned; the caller must emit it.  */
   20581              : static rtx_code_label *
   20582         1063 : ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
   20583              :                                   bool swap_operands)
   20584              : {
   20585         1063 :   bool unordered_compare = ix86_unordered_fp_compare (code);
   20586         1063 :   rtx_code_label *label;
   20587         1063 :   rtx tmp, reg;
   20588              : 
   20589         1063 :   if (swap_operands)
   20590           34 :     std::swap (op0, op1);
   20591              :   /* FLAGS_REG = compare (OP0, OP1), wrapped in UNSPEC_NOTRAP if unordered.  */
   20592         1063 :   label = gen_label_rtx ();
   20593         1063 :   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
   20594         1063 :   if (unordered_compare)
   20595          907 :     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
   20596         1063 :   reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
   20597         1063 :   emit_insn (gen_rtx_SET (reg, tmp));
   20598         1063 :   tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
   20599         1063 :   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
   20600              :                               gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
   20601         1063 :   tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
   20602         1063 :   JUMP_LABEL (tmp) = label;
   20603              : 
   20604         1063 :   return label;
   20605              : }
   20606              : 
   20607              : /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   20608              :    using comparison code CODE.  Operands are swapped for the comparison if
   20609              :    SWAP_OPERANDS is true.  Returns the mask, a new pseudo in OP0's mode.  */
   20610              : static rtx
   20611          539 : ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
   20612              :                               bool swap_operands)
   20613              : {
   20614          539 :   rtx (*insn)(rtx, rtx, rtx, rtx);
   20615          539 :   machine_mode mode = GET_MODE (op0);
   20616          539 :   rtx mask = gen_reg_rtx (mode);
   20617              : 
   20618          539 :   if (swap_operands)
   20619          362 :     std::swap (op0, op1);
   20620              :   /* DFmode uses the df setcc pattern; every other mode uses the sf one.  */
   20621          539 :   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
   20622              : 
   20623          539 :   emit_insn (insn (mask, op0, op1,
   20624              :                    gen_rtx_fmt_ee (code, mode, op0, op1)));
   20625          539 :   return mask;
   20626              : }
   20627              : 
   20628              : /* Expand copysign from SIGN to the positive value ABS_VALUE storing in
   20629              :    RESULT: RESULT = ABS_VALUE | (SIGN & m).  If MASK is non-null, it shall
   20630              :    be a mask to mask out the sign-bit and m is its complement.  */
   20631              : 
   20632              : static void
   20633         1015 : ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
   20634              : {
   20635         1015 :   machine_mode mode = GET_MODE (sign);
   20636         1015 :   rtx sgn = gen_reg_rtx (mode);
   20637         1015 :   if (mask == NULL_RTX)
   20638              :     {
   20639           28 :       machine_mode vmode;
   20640              :       /* No MASK given: build one here that is ANDed with SIGN directly.  */
   20641           28 :       if (mode == SFmode)
   20642              :         vmode = V4SFmode;
   20643              :       else if (mode == DFmode)
   20644              :         vmode = V2DFmode;
   20645              :       else if (mode == HFmode)
   20646              :         vmode = V8HFmode;
   20647              :       else
   20648           28 :         vmode = mode;
   20649              : 
   20650           28 :       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
   20651           28 :       if (!VECTOR_MODE_P (mode))
   20652              :         {
   20653              :           /* Scalar MODE: extract element 0 of the vector mask.  */
   20654           28 :           rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
   20655           28 :           tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
   20656           28 :           mask = gen_reg_rtx (mode);
   20657           28 :           emit_insn (gen_rtx_SET (mask, tmp));
   20658              :         }
   20659              :     }
   20660              :   else
   20661          987 :     mask = gen_rtx_NOT (mode, mask);
   20662         1015 :   emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
   20663         1015 :   emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
   20664         1015 : }
   20665              : 
   20666              : /* Expand SSE sequence for computing lround from OP1 storing
   20667              :    into OP0.  */
   20668              : 
   20669              : void
   20670           28 : ix86_expand_lround (rtx op0, rtx op1)
   20671              : {
   20672              :   /* C code for the stuff we're doing below:
   20673              :         tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
   20674              :         return (long)tmp;
   20675              :    */
   20676           28 :   machine_mode mode = GET_MODE (op1);
   20677           28 :   const struct real_format *fmt;
   20678           28 :   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
   20679           28 :   rtx adj;
   20680              : 
   20681              :   /* load nextafter (0.5, 0.0): pred_half = 0.5 - 2**(-p - 1) */
   20682           28 :   fmt = REAL_MODE_FORMAT (mode);
   20683           28 :   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
   20684           28 :   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
   20685              : 
   20686              :   /* adj = copysign (pred_half, op1) */
   20687           28 :   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
   20688           28 :   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
   20689              : 
   20690              :   /* adj = op1 + adj */
   20691           28 :   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
   20692              : 
   20693              :   /* op0 = (imode)adj */
   20694           28 :   expand_fix (op0, adj, 0);
   20695           28 : }
   20696              : 
   20697              : /* Expand SSE2 sequence for computing lfloor (if DO_FLOOR) or lceil
   20698              :    from OP1 storing into OP0.  */
   20699              : 
   20700              : void
   20701           68 : ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
   20702              : {
   20703              :   /* C code for the stuff we're doing below (for do_floor):
   20704              :         xi = (long)op1;
   20705              :         xi -= (double)xi > op1 ? 1 : 0;
   20706              :         return xi;
   20707              :    */
   20708           68 :   machine_mode fmode = GET_MODE (op1);
   20709           68 :   machine_mode imode = GET_MODE (op0);
   20710           68 :   rtx ireg, freg, tmp;
   20711           68 :   rtx_code_label *label;
   20712              : 
   20713              :   /* ireg = (long)op1 */
   20714           68 :   ireg = gen_reg_rtx (imode);
   20715           68 :   expand_fix (ireg, op1, 0);
   20716              : 
   20717              :   /* freg = (double)ireg */
   20718           68 :   freg = gen_reg_rtx (fmode);
   20719           68 :   expand_float (freg, ireg, 0);
   20720              : 
   20721              :   /* floor: if (freg > op1) ireg -= 1;  ceil: if (op1 > freg) ireg += 1 */
   20722          136 :   label = ix86_expand_sse_compare_and_jump (UNLE,
   20723           68 :                                             freg, op1, !do_floor);
   20724          102 :   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
   20725              :                              ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
   20726           68 :   emit_move_insn (ireg, tmp);
   20727              : 
   20728           68 :   emit_label (label);
   20729           68 :   LABEL_NUSES (label) = 1;
   20730              : 
   20731           68 :   emit_move_insn (op0, ireg);
   20732           68 : }
   20733              : 
   20734              : /* Generate and return a register holding 2**(p - 1) in mode MODE, where p
   20735              :    is the precision of MODE's format.  MODE must be DFmode or SFmode.  */
   20736              : 
   20737              : static rtx
   20738          995 : ix86_gen_TWO52 (machine_mode mode)
   20739              : {
   20740          995 :   const struct real_format *fmt;
   20741          995 :   REAL_VALUE_TYPE TWO52r;
   20742          995 :   rtx TWO52;
   20743              : 
   20744          995 :   fmt = REAL_MODE_FORMAT (mode);
   20745          995 :   real_2expN (&TWO52r, fmt->p - 1, mode);
   20746          995 :   TWO52 = const_double_from_real_value (TWO52r, mode);
   20747          995 :   TWO52 = force_reg (mode, TWO52);
   20748              : 
   20749          995 :   return TWO52;
   20750              : }
   20751              : 
   20752              : /* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
   20753              : 
   20754              : void
   20755          121 : ix86_expand_rint (rtx operand0, rtx operand1)
   20756              : {
   20757              :   /* C code for the stuff we're doing below:
   20758              :         xa = fabs (operand1);
   20759              :         if (!isless (xa, 2**52))
   20760              :           return operand1;
   20761              :         two52 = 2**52;
   20762              :         if (flag_rounding_math)
   20763              :           {
   20764              :             two52 = copysign (two52, operand1);
   20765              :             xa = operand1;
   20766              :           }
   20767              :         xa = xa + two52 - two52;
   20768              :         return copysign (xa, operand1);
   20769              :    */
   20770          121 :   machine_mode mode = GET_MODE (operand0);
   20771          121 :   rtx res, xa, TWO52, mask;
   20772          121 :   rtx_code_label *label;
   20773              : 
   20774          121 :   TWO52 = ix86_gen_TWO52 (mode);
   20775              : 
   20776              :   /* Temporary for holding the result, initialized to the input
   20777              :      operand to ease control flow.  */
   20778          121 :   res = copy_to_reg (operand1);
   20779              : 
   20780              :   /* xa = abs (operand1) */
   20781          121 :   xa = ix86_expand_sse_fabs (res, &mask);
   20782              : 
   20783              :   /* if (!isless (xa, TWO52)) goto label; */
   20784          121 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   20785              :   /* two52 = copysign (two52, operand1); xa = operand1; */
   20786          121 :   if (flag_rounding_math)
   20787              :     {
   20788           53 :       ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
   20789           53 :       xa = res;
   20790              :     }
   20791              :   /* xa = xa + TWO52 - TWO52; */
   20792          121 :   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
   20793          121 :   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
   20794              : 
   20795              :   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
   20796          121 :   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
   20797           53 :     xa = ix86_expand_sse_fabs (xa, NULL);
   20798              :   /* res = copysign (xa, operand1) */
   20799          121 :   ix86_sse_copysign_to_positive (res, xa, res, mask);
   20800              :   /* Inputs failing isless (xa, TWO52) jump here with res == operand1.  */
   20801          121 :   emit_label (label);
   20802          121 :   LABEL_NUSES (label) = 1;
   20803              : 
   20804          121 :   emit_move_insn (operand0, res);
   20805          121 : }
   20806              : 
   20807              : /* Expand SSE2 sequence for computing floor (if DO_FLOOR) or ceil
   20808              :    from OPERAND1 storing into OPERAND0.  */
   20809              : void
   20810          539 : ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
   20811              : {
   20812              :   /* C code for the stuff we expand below.
   20813              :         double xa = fabs (x), x2;
   20814              :         if (!isless (xa, TWO52))
   20815              :           return x;
   20816              :         x2 = (double)(long)x;
   20817              : 
   20818              :      Compensate.  Floor:
   20819              :         if (x2 > x)
   20820              :           x2 -= 1;
   20821              :      Compensate.  Ceil:
   20822              :         if (x2 < x)
   20823              :           x2 += 1;
   20824              : 
   20825              :         if (HONOR_SIGNED_ZEROS (mode))
   20826              :           return copysign (x2, x);
   20827              :         return x2;
   20828              :    */
   20829          539 :   machine_mode mode = GET_MODE (operand0);
   20830          539 :   rtx xa, xi, TWO52, tmp, one, res, mask;
   20831          539 :   rtx_code_label *label;
   20832              : 
   20833          539 :   TWO52 = ix86_gen_TWO52 (mode);
   20834              : 
   20835              :   /* Temporary for holding the result, initialized to the input
   20836              :      operand to ease control flow.  */
   20837          539 :   res = copy_to_reg (operand1);
   20838              : 
   20839              :   /* xa = abs (operand1) */
   20840          539 :   xa = ix86_expand_sse_fabs (res, &mask);
   20841              : 
   20842              :   /* if (!isless (xa, TWO52)) goto label; */
   20843          539 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   20844              : 
   20845              :   /* xa = (double)(long)x, reusing the xa register */
   20846          539 :   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
   20847          539 :   expand_fix (xi, res, 0);
   20848          539 :   expand_float (xa, xi, 0);
   20849              : 
   20850              :   /* generate 1.0 */
   20851          539 :   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
   20852              : 
   20853              :   /* Compensate.  floor: xa -= (xa > x ? 1 : 0); ceil: xa += (x > xa ? 1 : 0) */
   20854          539 :   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
   20855          539 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
   20856          901 :   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
   20857              :                              xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   20858          539 :   if (HONOR_SIGNED_ZEROS (mode))
   20859              :     {
   20860              :       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
   20861          492 :       if (do_floor && flag_rounding_math)
   20862            0 :         tmp = ix86_expand_sse_fabs (tmp, NULL);
   20863              :       /* tmp = copysign (tmp, operand1) */
   20864          492 :       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
   20865              :     }
   20866          539 :   emit_move_insn (res, tmp);
   20867              : 
   20868          539 :   emit_label (label);
   20869          539 :   LABEL_NUSES (label) = 1;
   20870              : 
   20871          539 :   emit_move_insn (operand0, res);
   20872          539 : }
   20873              : 
   20874              : /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   20875              :    into OPERAND0 without relying on DImode truncation via cvttsd2siq
   20876              :    that is only available on 64bit targets.  */
   20877              : void
   20878            0 : ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
   20879              : {
   20880              :   /* C code for the stuff we expand below.
   20881              :         double xa = fabs (x), x2;
   20882              :         if (!isless (xa, TWO52))
   20883              :           return x;
   20884              :         xa = xa + TWO52 - TWO52;
   20885              :         x2 = copysign (xa, x);
   20886              : 
   20887              :      Compensate.  Floor:
   20888              :         if (x2 > x)
   20889              :           x2 -= 1;
   20890              :      Compensate.  Ceil:
   20891              :         if (x2 < x)
   20892              :           x2 += 1;
   20893              : 
   20894              :         if (HONOR_SIGNED_ZEROS (mode))
   20895              :           x2 = copysign (x2, x);
   20896              :         return x2;
   20897              :    */
   20898            0 :   machine_mode mode = GET_MODE (operand0);
   20899            0 :   rtx xa, TWO52, tmp, one, res, mask;
   20900            0 :   rtx_code_label *label;
   20901              : 
   20902            0 :   TWO52 = ix86_gen_TWO52 (mode);
   20903              : 
   20904              :   /* Temporary for holding the result, initialized to the input
   20905              :      operand to ease control flow.  */
   20906            0 :   res = copy_to_reg (operand1);
   20907              : 
   20908              :   /* xa = abs (operand1) */
   20909            0 :   xa = ix86_expand_sse_fabs (res, &mask);
   20910              : 
   20911              :   /* if (!isless (xa, TWO52)) goto label; */
   20912            0 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   20913              : 
   20914              :   /* xa = xa + TWO52 - TWO52; */
   20915            0 :   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
   20916            0 :   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
   20917              : 
   20918              :   /* xa = copysign (xa, operand1) */
   20919            0 :   ix86_sse_copysign_to_positive (xa, xa, res, mask);
   20920              : 
   20921              :   /* generate 1.0 */
   20922            0 :   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
   20923              : 
   20924              :   /* Compensate.  floor: xa -= (xa > x ? 1 : 0); ceil: xa += (x > xa ? 1 : 0) */
   20925            0 :   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
   20926            0 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
   20927            0 :   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
   20928              :                              xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   20929            0 :   if (HONOR_SIGNED_ZEROS (mode))
   20930              :     {
   20931              :       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
   20932            0 :       if (do_floor && flag_rounding_math)
   20933            0 :         tmp = ix86_expand_sse_fabs (tmp, NULL);
   20934              :       /* tmp = copysign (tmp, operand1) */
   20935            0 :       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
   20936              :     }
   20937            0 :   emit_move_insn (res, tmp);
   20938              : 
   20939            0 :   emit_label (label);
   20940            0 :   LABEL_NUSES (label) = 1;
   20941              : 
   20942            0 :   emit_move_insn (operand0, res);
   20943            0 : }
   20944              : 
   20945              : /* Expand SSE sequence for computing trunc
   20946              :    from OPERAND1 storing into OPERAND0.  */
   20947              : void
   20948          321 : ix86_expand_trunc (rtx operand0, rtx operand1)
   20949              : {
   20950              :   /* C code for SSE variant we expand below.
   20951              :         double xa = fabs (x), x2;
   20952              :         if (!isless (xa, TWO52))
   20953              :           return x;
   20954              :         x2 = (double)(long)x;
   20955              :         if (HONOR_SIGNED_ZEROS (mode))
   20956              :           return copysign (x2, x);
   20957              :         return x2;
   20958              :    */
   20959          321 :   machine_mode mode = GET_MODE (operand0);
   20960          321 :   rtx xa, xi, TWO52, res, mask;
   20961          321 :   rtx_code_label *label;
   20962              : 
   20963          321 :   TWO52 = ix86_gen_TWO52 (mode);
   20964              : 
   20965              :   /* Temporary for holding the result, initialized to the input
   20966              :      operand to ease control flow.  */
   20967          321 :   res = copy_to_reg (operand1);
   20968              : 
   20969              :   /* xa = abs (operand1) */
   20970          321 :   xa = ix86_expand_sse_fabs (res, &mask);
   20971              : 
   20972              :   /* if (!isless (xa, TWO52)) goto label; */
   20973          321 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   20974              : 
   20975              :   /* xa = (double)(long)x, reusing the xa register */
   20976          321 :   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
   20977          321 :   expand_fix (xi, res, 0);
   20978          321 :   expand_float (xa, xi, 0);
   20979              :   /* xa = copysign (xa, operand1) */
   20980          321 :   if (HONOR_SIGNED_ZEROS (mode))
   20981          307 :     ix86_sse_copysign_to_positive (xa, xa, res, mask);
   20982              : 
   20983          321 :   emit_move_insn (res, xa);
   20984              : 
   20985          321 :   emit_label (label);
   20986          321 :   LABEL_NUSES (label) = 1;
   20987              : 
   20988          321 :   emit_move_insn (operand0, res);
   20989          321 : }
   20990              : 
   20991              : /* Expand SSE sequence for computing trunc from OPERAND1 storing
   20992              :    into OPERAND0 without relying on DImode truncation via cvttsd2siq
   20993              :    that is only available on 64bit targets.  */
   20994              : void
   20995            0 : ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
   20996              : {
   20997            0 :   machine_mode mode = GET_MODE (operand0);
   20998            0 :   rtx xa, xa2, TWO52, tmp, one, res, mask;
   20999            0 :   rtx_code_label *label;
   21000              : 
   21001              :   /* C code for SSE variant we expand below.
   21002              :         double xa = fabs (x), x2;
   21003              :         if (!isless (xa, TWO52))
   21004              :           return x;
   21005              :         xa2 = xa + TWO52 - TWO52;
   21006              :      Compensate:
   21007              :         if (xa2 > xa)
   21008              :           xa2 -= 1.0;
   21009              :         x2 = copysign (xa2, x);
   21010              :         return x2;
   21011              :    */
   21012              : 
   21013            0 :   TWO52 = ix86_gen_TWO52 (mode);
   21014              : 
   21015              :   /* Temporary for holding the result, initialized to the input
   21016              :      operand to ease control flow.  */
   21017            0 :   res =copy_to_reg (operand1);
   21018              : 
   21019              :   /* xa = abs (operand1) */
   21020            0 :   xa = ix86_expand_sse_fabs (res, &mask);
   21021              : 
   21022              :   /* if (!isless (xa, TWO52)) goto label; */
   21023            0 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   21024              : 
   21025              :   /* xa2 = xa + TWO52 - TWO52; */
   21026            0 :   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
   21027            0 :   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
   21028              : 
   21029              :   /* generate 1.0 */
   21030            0 :   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
   21031              : 
   21032              :   /* Compensate: tmp = xa2 - (xa2 > xa ? 1 : 0)  */
   21033            0 :   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
   21034            0 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
   21035            0 :   tmp = expand_simple_binop (mode, MINUS,
   21036              :                              xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   21037              :   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
   21038            0 :   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
   21039            0 :     tmp = ix86_expand_sse_fabs (tmp, NULL);
   21040              : 
   21041              :   /* res = copysign (tmp, operand1), tmp being the compensated xa2 */
   21042            0 :   ix86_sse_copysign_to_positive (res, tmp, res, mask);
   21043              : 
   21044            0 :   emit_label (label);
   21045            0 :   LABEL_NUSES (label) = 1;
   21046              : 
   21047            0 :   emit_move_insn (operand0, res);
   21048            0 : }
   21049              : 
   21050              : /* Expand SSE sequence for computing round
   21051              :    from OPERAND1 storing into OPERAND0.  */
   21052              : void
   21053           14 : ix86_expand_round (rtx operand0, rtx operand1)
   21054              : {
   21055              :   /* C code for the stuff we're doing below:
   21056              :         double xa = fabs (x);
   21057              :         if (!isless (xa, TWO52))
   21058              :           return x;
   21059              :         xa = (double)(long)(xa + nextafter (0.5, 0.0));
   21060              :         return copysign (xa, x);
   21061              :    */
   21062           14 :   machine_mode mode = GET_MODE (operand0);
   21063           14 :   rtx res, TWO52, xa, xi, half, mask;
   21064           14 :   rtx_code_label *label;
   21065           14 :   const struct real_format *fmt;
   21066           14 :   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
   21067              : 
   21068              :   /* Temporary for holding the result, initialized to the input
   21069              :      operand to ease control flow.  */
   21070           14 :   res = copy_to_reg (operand1);
   21071              :   /* xa = abs (operand1); if (!isless (xa, TWO52)) goto label; */
   21072           14 :   TWO52 = ix86_gen_TWO52 (mode);
   21073           14 :   xa = ix86_expand_sse_fabs (res, &mask);
   21074           14 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   21075              : 
   21076              :   /* load nextafter (0.5, 0.0): pred_half = 0.5 - 2**(-p - 1) */
   21077           14 :   fmt = REAL_MODE_FORMAT (mode);
   21078           14 :   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
   21079           14 :   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
   21080              : 
   21081              :   /* xa = xa + nextafter (0.5, 0.0) */
   21082           14 :   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
   21083           14 :   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
   21084              : 
   21085              :   /* xa = (double)(long)xa, going through int_mode_for_mode (mode) */
   21086           14 :   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
   21087           14 :   expand_fix (xi, xa, 0);
   21088           14 :   expand_float (xa, xi, 0);
   21089              : 
   21090              :   /* res = copysign (xa, operand1) */
   21091           14 :   ix86_sse_copysign_to_positive (res, xa, res, mask);
   21092              : 
   21093           14 :   emit_label (label);
   21094           14 :   LABEL_NUSES (label) = 1;
   21095              : 
   21096           14 :   emit_move_insn (operand0, res);
   21097           14 : }
   21098              : 
   21099              : /* Expand SSE sequence for computing round from OPERAND1 storing
   21100              :    into OPERAND0 without relying on DImode truncation via cvttsd2siq
   21101              :    that is only available on 64bit targets.  */
   21102              : void
   21103            0 : ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
   21104              : {
   21105              :   /* C code for the stuff we expand below.
   21106              :         double xa = fabs (x), xa2, x2;
   21107              :         if (!isless (xa, TWO52))
   21108              :           return x;
   21109              :      Using the absolute value and copying back sign makes
   21110              :      -0.0 -> -0.0 correct.
   21111              :         xa2 = xa + TWO52 - TWO52;
   21112              :      Compensate.
   21113              :         dxa = xa2 - xa;
   21114              :         if (dxa <= -0.5)
   21115              :           xa2 += 1;
   21116              :         else if (dxa > 0.5)
   21117              :           xa2 -= 1;
   21118              :         x2 = copysign (xa2, x);
   21119              :         return x2;
   21120              :    */
   21121            0 :   machine_mode mode = GET_MODE (operand0);
   21122            0 :   rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
   21123            0 :   rtx_code_label *label;
   21124              : 
   21125            0 :   TWO52 = ix86_gen_TWO52 (mode);
   21126              : 
   21127              :   /* Temporary for holding the result, initialized to the input
   21128              :      operand to ease control flow.  */
   21129            0 :   res = copy_to_reg (operand1);
   21130              : 
   21131              :   /* xa = abs (operand1) */
   21132            0 :   xa = ix86_expand_sse_fabs (res, &mask);
   21133              : 
   21134              :   /* if (!isless (xa, TWO52)) goto label; */
   21135            0 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   21136              : 
   21137              :   /* xa2 = xa + TWO52 - TWO52; */
   21138            0 :   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
   21139            0 :   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
   21140              : 
   21141              :   /* dxa = xa2 - xa; */
   21142            0 :   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
   21143              : 
   21144              :   /* generate 0.5, 1.0 and -0.5 */
   21145            0 :   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
   21146            0 :   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
   21147            0 :   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
   21148              :                                0, OPTAB_DIRECT);
   21149              : 
   21150              :   /* Compensate.  */
   21151              :   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
   21152            0 :   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
   21153            0 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
   21154            0 :   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   21155              :   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
   21156            0 :   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
   21157            0 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
   21158            0 :   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   21159              : 
   21160              :   /* res = copysign (xa2, operand1) */
   21161            0 :   ix86_sse_copysign_to_positive (res, xa2, res, mask);
   21162              : 
   21163            0 :   emit_label (label);
   21164            0 :   LABEL_NUSES (label) = 1;
   21165              : 
   21166            0 :   emit_move_insn (operand0, res);
   21167            0 : }
   21168              : 
   21169              : /* Expand SSE sequence for computing round
   21170              :    from OP1 storing into OP0 using sse4 round insn.  */
   21171              : void
   21172            9 : ix86_expand_round_sse4 (rtx op0, rtx op1)
   21173              : {
   21174            9 :   machine_mode mode = GET_MODE (op0);
   21175            9 :   rtx e1, e2, res, half;
   21176            9 :   const struct real_format *fmt;
   21177            9 :   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
   21178            9 :   rtx (*gen_copysign) (rtx, rtx, rtx);
   21179            9 :   rtx (*gen_round) (rtx, rtx, rtx);
   21180              : 
   21181            9 :   switch (mode)
   21182              :     {
   21183              :     case E_HFmode:
   21184              :       gen_copysign = gen_copysignhf3;
   21185              :       gen_round = gen_sse4_1_roundhf2;
   21186              :       break;
   21187            4 :     case E_SFmode:
   21188            4 :       gen_copysign = gen_copysignsf3;
   21189            4 :       gen_round = gen_sse4_1_roundsf2;
   21190            4 :       break;
   21191            4 :     case E_DFmode:
   21192            4 :       gen_copysign = gen_copysigndf3;
   21193            4 :       gen_round = gen_sse4_1_rounddf2;
   21194            4 :       break;
   21195            0 :     default:
   21196            0 :       gcc_unreachable ();
   21197              :     }
   21198              : 
   21199              :   /* round (a) = trunc (a + copysign (0.5, a)) */
   21200              : 
   21201              :   /* load nextafter (0.5, 0.0) */
   21202            9 :   fmt = REAL_MODE_FORMAT (mode);
   21203            9 :   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
   21204            9 :   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
   21205            9 :   half = const_double_from_real_value (pred_half, mode);
   21206              : 
   21207              :   /* e1 = copysign (0.5, op1) */
   21208            9 :   e1 = gen_reg_rtx (mode);
   21209            9 :   emit_insn (gen_copysign (e1, half, op1));
   21210              : 
   21211              :   /* e2 = op1 + e1 */
   21212            9 :   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
   21213              : 
   21214              :   /* res = trunc (e2) */
   21215            9 :   res = gen_reg_rtx (mode);
   21216            9 :   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
   21217              : 
   21218            9 :   emit_move_insn (op0, res);
   21219            9 : }
   21220              : 
   21221              : /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   21222              :    insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   21223              :    insn every time.  */
   21224              : 
   21225              : static GTY(()) rtx_insn *vselect_insn;
   21226              : 
   21227              : /* Initialize vselect_insn.  */
   21228              : 
   21229              : static void
   21230         7588 : init_vselect_insn (void)
   21231              : {
   21232         7588 :   unsigned i;
   21233         7588 :   rtx x;
   21234              : 
   21235         7588 :   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
   21236       493220 :   for (i = 0; i < MAX_VECT_LEN; ++i)
   21237       485632 :     XVECEXP (x, 0, i) = const0_rtx;
   21238         7588 :   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
   21239              :                                                         const0_rtx), x);
   21240         7588 :   x = gen_rtx_SET (const0_rtx, x);
   21241         7588 :   start_sequence ();
   21242         7588 :   vselect_insn = emit_insn (x);
   21243         7588 :   end_sequence ();
   21244         7588 : }
   21245              : 
   21246              : /* Construct (set target (vec_select op0 (parallel perm))) and
   21247              :    return true if that's a valid instruction in the active ISA.  */
   21248              : 
   21249              : static bool
   21250       536360 : expand_vselect (rtx target, rtx op0, const unsigned char *perm,
   21251              :                 unsigned nelt, bool testing_p)
   21252              : {
   21253       536360 :   unsigned int i;
   21254       536360 :   rtx x, save_vconcat;
   21255       536360 :   int icode;
   21256              : 
   21257       536360 :   if (vselect_insn == NULL_RTX)
   21258         1678 :     init_vselect_insn ();
   21259              : 
   21260       536360 :   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
   21261       536360 :   PUT_NUM_ELEM (XVEC (x, 0), nelt);
   21262      4118296 :   for (i = 0; i < nelt; ++i)
   21263      3581936 :     XVECEXP (x, 0, i) = GEN_INT (perm[i]);
   21264       536360 :   save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
   21265       536360 :   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
   21266       536360 :   PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
   21267       536360 :   SET_DEST (PATTERN (vselect_insn)) = target;
   21268       536360 :   icode = recog_memoized (vselect_insn);
   21269              : 
   21270       536360 :   if (icode >= 0 && !testing_p)
   21271        72071 :     emit_insn (copy_rtx (PATTERN (vselect_insn)));
   21272              : 
   21273       536360 :   SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
   21274       536360 :   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
   21275       536360 :   INSN_CODE (vselect_insn) = -1;
   21276              : 
   21277       536360 :   return icode >= 0;
   21278              : }
   21279              : 
   21280              : /* Similar, but generate a vec_concat from op0 and op1 as well.  */
   21281              : 
   21282              : static bool
   21283       471081 : expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   21284              :                         const unsigned char *perm, unsigned nelt,
   21285              :                         bool testing_p)
   21286              : {
   21287       471081 :   machine_mode v2mode;
   21288       471081 :   rtx x;
   21289       471081 :   bool ok;
   21290              : 
   21291       471081 :   if (vselect_insn == NULL_RTX)
   21292         5910 :     init_vselect_insn ();
   21293              : 
   21294       471081 :   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
   21295              :     return false;
   21296       471081 :   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
   21297       471081 :   PUT_MODE (x, v2mode);
   21298       471081 :   XEXP (x, 0) = op0;
   21299       471081 :   XEXP (x, 1) = op1;
   21300       471081 :   ok = expand_vselect (target, x, perm, nelt, testing_p);
   21301       471081 :   XEXP (x, 0) = const0_rtx;
   21302       471081 :   XEXP (x, 1) = const0_rtx;
   21303       471081 :   return ok;
   21304              : }
   21305              : 
   21306              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21307              :    using movss or movsd.  */
   21308              : static bool
   21309       340095 : expand_vec_perm_movs (struct expand_vec_perm_d *d)
   21310              : {
   21311       340095 :   machine_mode vmode = d->vmode;
   21312       340095 :   unsigned i, nelt = d->nelt;
   21313       340095 :   rtx x;
   21314              : 
   21315       340095 :   if (d->one_operand_p)
   21316              :     return false;
   21317              : 
   21318       314388 :   if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
   21319       165506 :       && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
   21320        84815 :       && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
   21321              :     return false;
   21322              : 
   21323              :   /* Only the first element is changed.  */
   21324       238677 :   if (d->perm[0] != nelt && d->perm[0] != 0)
   21325              :     return false;
   21326       203853 :   for (i = 1; i < nelt; ++i)
   21327       147808 :     if (d->perm[i] != i + nelt - d->perm[0])
   21328              :       return false;
   21329              : 
   21330        56045 :   if (d->testing_p)
   21331              :     return true;
   21332              : 
   21333         6592 :   if (d->perm[0] == nelt)
   21334            0 :     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
   21335              :   else
   21336         6592 :     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
   21337              : 
   21338         6592 :   emit_insn (gen_rtx_SET (d->target, x));
   21339              : 
   21340         6592 :   return true;
   21341              : }
   21342              : 
   21343              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21344              :    using insertps.  */
   21345              : static bool
   21346       284050 : expand_vec_perm_insertps (struct expand_vec_perm_d *d)
   21347              : {
   21348       284050 :   machine_mode vmode = d->vmode;
   21349       284050 :   unsigned i, cnt_s, nelt = d->nelt;
   21350       284050 :   int cnt_d = -1;
   21351       284050 :   rtx src, dst;
   21352              : 
   21353       284050 :   if (d->one_operand_p)
   21354              :     return false;
   21355              : 
   21356       258343 :   if (!(TARGET_SSE4_1
   21357        37743 :         && (vmode == V4SFmode || vmode == V4SImode
   21358        27498 :             || (TARGET_MMX_WITH_SSE
   21359        21234 :                 && (vmode == V2SFmode || vmode == V2SImode)))))
   21360              :     return false;
   21361              : 
   21362        55746 :   for (i = 0; i < nelt; ++i)
   21363              :     {
   21364        52491 :       if (d->perm[i] == i)
   21365        10620 :         continue;
   21366        41871 :       if (cnt_d != -1)
   21367              :         {
   21368              :           cnt_d = -1;
   21369              :           break;
   21370              :         }
   21371        22563 :       cnt_d = i;
   21372              :     }
   21373              : 
   21374        22563 :   if (cnt_d == -1)
   21375              :     {
   21376        43673 :       for (i = 0; i < nelt; ++i)
   21377              :         {
   21378        40860 :           if (d->perm[i] == i + nelt)
   21379         5057 :             continue;
   21380        35803 :           if (cnt_d != -1)
   21381              :             return false;
   21382        19308 :           cnt_d = i;
   21383              :         }
   21384              : 
   21385         2813 :       if (cnt_d == -1)
   21386              :         return false;
   21387              :     }
   21388              : 
   21389         6068 :   if (d->testing_p)
   21390              :     return true;
   21391              : 
   21392          524 :   gcc_assert (cnt_d != -1);
   21393              : 
   21394          524 :   cnt_s = d->perm[cnt_d];
   21395          524 :   if (cnt_s < nelt)
   21396              :     {
   21397          221 :       src = d->op0;
   21398          221 :       dst = d->op1;
   21399              :     }
   21400              :   else
   21401              :     {
   21402          303 :       cnt_s -= nelt;
   21403          303 :       src = d->op1;
   21404          303 :       dst = d->op0;
   21405              :      }
   21406          524 :   gcc_assert (cnt_s < nelt);
   21407              : 
   21408          524 :   rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
   21409          524 :                                GEN_INT (cnt_s << 6 | cnt_d << 4));
   21410          524 :   emit_insn (x);
   21411              : 
   21412          524 :   return true;
   21413              : }
   21414              : 
   21415              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21416              :    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
   21417              : 
   21418              : static bool
   21419       344817 : expand_vec_perm_blend (struct expand_vec_perm_d *d)
   21420              : {
   21421       344817 :   machine_mode mmode, vmode = d->vmode;
   21422       344817 :   unsigned i, nelt = d->nelt;
   21423       344817 :   unsigned HOST_WIDE_INT mask;
   21424       344817 :   rtx target, op0, op1, maskop, x;
   21425       344817 :   rtx rperm[32], vperm;
   21426              : 
   21427       344817 :   if (d->one_operand_p)
   21428              :     return false;
   21429         6675 :   if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
   21430       320136 :       && (TARGET_AVX512BW
   21431          661 :           || GET_MODE_UNIT_SIZE (vmode) >= 4))
   21432              :     ;
   21433       331055 :   else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
   21434              :     ;
   21435       312685 :   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
   21436              :     ;
   21437       307590 :   else if (TARGET_SSE4_1
   21438       343547 :            && (GET_MODE_SIZE (vmode) == 16
   21439        29734 :                || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
   21440         3636 :                || GET_MODE_SIZE (vmode) == 4))
   21441              :     ;
   21442              :   else
   21443              :     return false;
   21444              : 
   21445              :   /* This is a blend, not a permute.  Elements must stay in their
   21446              :      respective lanes.  */
   21447        97298 :   for (i = 0; i < nelt; ++i)
   21448              :     {
   21449        92576 :       unsigned e = d->perm[i];
   21450        92576 :       if (!(e == i || e == i + nelt))
   21451              :         return false;
   21452              :     }
   21453              : 
   21454         4722 :   if (d->testing_p)
   21455              :     return true;
   21456              : 
   21457              :   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
   21458              :      decision should be extracted elsewhere, so that we only try that
   21459              :      sequence once all budget==3 options have been tried.  */
   21460         2786 :   target = d->target;
   21461         2786 :   op0 = d->op0;
   21462         2786 :   op1 = d->op1;
   21463         2786 :   mask = 0;
   21464              : 
   21465         2786 :   switch (vmode)
   21466              :     {
   21467              :     case E_V8DFmode:
   21468              :     case E_V16SFmode:
   21469              :     case E_V4DFmode:
   21470              :     case E_V8SFmode:
   21471              :     case E_V2DFmode:
   21472              :     case E_V4SFmode:
   21473              :     case E_V2SFmode:
   21474              :     case E_V2HImode:
   21475              :     case E_V4HImode:
   21476              :     case E_V8HImode:
   21477              :     case E_V8SImode:
   21478              :     case E_V32HImode:
   21479              :     case E_V64QImode:
   21480              :     case E_V16SImode:
   21481              :     case E_V8DImode:
   21482        10967 :       for (i = 0; i < nelt; ++i)
   21483         9472 :         mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
   21484              :       break;
   21485              : 
   21486              :     case E_V2DImode:
   21487           18 :       for (i = 0; i < 2; ++i)
   21488           18 :         mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
   21489            6 :       vmode = V8HImode;
   21490            6 :       goto do_subreg;
   21491              : 
   21492              :     case E_V2SImode:
   21493           24 :       for (i = 0; i < 2; ++i)
   21494           24 :         mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
   21495            8 :       vmode = V4HImode;
   21496            8 :       goto do_subreg;
   21497              : 
   21498          871 :     case E_V4SImode:
   21499          871 :       if (TARGET_AVX2)
   21500              :         {
   21501              :           /* Use vpblendd instead of vpblendw.  */
   21502          185 :           for (i = 0; i < nelt; ++i)
   21503          148 :             mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
   21504              :           break;
   21505              :         }
   21506              :       else
   21507              :         {
   21508         4170 :           for (i = 0; i < 4; ++i)
   21509         5200 :             mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
   21510          834 :           vmode = V8HImode;
   21511          834 :           goto do_subreg;
   21512              :         }
   21513              : 
   21514              :     case E_V16QImode:
   21515              :       /* See if bytes move in pairs so we can use pblendw with
   21516              :          an immediate argument, rather than pblendvb with a vector
   21517              :          argument.  */
   21518          102 :       for (i = 0; i < 16; i += 2)
   21519          100 :         if (d->perm[i] + 1 != d->perm[i + 1])
   21520              :           {
   21521           83 :           use_pblendvb:
   21522         3502 :             for (i = 0; i < nelt; ++i)
   21523         3212 :               rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
   21524              : 
   21525          290 :           finish_pblendvb:
   21526          291 :             vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
   21527          291 :             vperm = force_reg (vmode, vperm);
   21528              : 
   21529          582 :             if (GET_MODE_SIZE (vmode) == 4)
   21530          135 :               emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
   21531          312 :             else if (GET_MODE_SIZE (vmode) == 8)
   21532           40 :               emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
   21533          232 :             else if (GET_MODE_SIZE (vmode) == 16)
   21534           83 :               emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
   21535              :             else
   21536           33 :               emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
   21537          291 :             if (target != d->target)
   21538            1 :               emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   21539          291 :             return true;
   21540              :           }
   21541              : 
   21542           18 :       for (i = 0; i < 8; ++i)
   21543           16 :         mask |= (d->perm[i * 2] >= 16) << i;
   21544              :       vmode = V8HImode;
   21545              :       /* FALLTHRU */
   21546              : 
   21547          931 :     do_subreg:
   21548          931 :       target = gen_reg_rtx (vmode);
   21549          931 :       op0 = gen_lowpart (vmode, op0);
   21550          931 :       op1 = gen_lowpart (vmode, op1);
   21551          931 :       break;
   21552              : 
   21553              :     case E_V8QImode:
   21554           40 :       for (i = 0; i < 8; i += 2)
   21555           40 :         if (d->perm[i] + 1 != d->perm[i + 1])
   21556           40 :           goto use_pblendvb;
   21557              : 
   21558            0 :       for (i = 0; i < 4; ++i)
   21559            0 :         mask |= (d->perm[i * 2] >= 8) << i;
   21560            0 :       vmode = V4HImode;
   21561            0 :       goto do_subreg;
   21562              : 
   21563              :     case E_V4QImode:
   21564          153 :       for (i = 0; i < 4; i += 2)
   21565          150 :         if (d->perm[i] + 1 != d->perm[i + 1])
   21566          135 :           goto use_pblendvb;
   21567              : 
   21568            9 :       for (i = 0; i < 2; ++i)
   21569            6 :         mask |= (d->perm[i * 2] >= 4) << i;
   21570            3 :       vmode = V2HImode;
   21571            3 :       goto do_subreg;
   21572              : 
   21573              :     case E_V32QImode:
   21574              :       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
   21575          916 :       for (i = 0; i < 32; i += 2)
   21576          864 :         if (d->perm[i] + 1 != d->perm[i + 1])
   21577           32 :           goto use_pblendvb;
   21578              :       /* See if bytes move in quadruplets.  If yes, vpblendd
   21579              :          with immediate can be used.  */
   21580          468 :       for (i = 0; i < 32; i += 4)
   21581          416 :         if (d->perm[i] + 2 != d->perm[i + 2])
   21582              :           break;
   21583           52 :       if (i < 32)
   21584              :         {
   21585              :           /* See if bytes move the same in both lanes.  If yes,
   21586              :              vpblendw with immediate can be used.  */
   21587            0 :           for (i = 0; i < 16; i += 2)
   21588            0 :             if (d->perm[i] + 16 != d->perm[i + 16])
   21589            0 :               goto use_pblendvb;
   21590              : 
   21591              :           /* Use vpblendw.  */
   21592            0 :           for (i = 0; i < 16; ++i)
   21593            0 :             mask |= (d->perm[i * 2] >= 32) << i;
   21594            0 :           vmode = V16HImode;
   21595            0 :           goto do_subreg;
   21596              :         }
   21597              : 
   21598              :       /* Use vpblendd.  */
   21599          468 :       for (i = 0; i < 8; ++i)
   21600          416 :         mask |= (d->perm[i * 4] >= 32) << i;
   21601           52 :       vmode = V8SImode;
   21602           52 :       goto do_subreg;
   21603              : 
   21604              :     case E_V16HImode:
   21605              :       /* See if words move in pairs.  If yes, vpblendd can be used.  */
   21606          186 :       for (i = 0; i < 16; i += 2)
   21607          169 :         if (d->perm[i] + 1 != d->perm[i + 1])
   21608              :           break;
   21609           50 :       if (i < 16)
   21610              :         {
   21611              :           /* See if words move the same in both lanes.  If not,
   21612              :              vpblendvb must be used.  */
   21613          290 :           for (i = 0; i < 8; i++)
   21614          258 :             if (d->perm[i] + 8 != d->perm[i + 8])
   21615              :               {
   21616              :                 /* Use vpblendvb.  */
   21617           33 :                 for (i = 0; i < 32; ++i)
   21618           32 :                   rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
   21619              : 
   21620            1 :                 vmode = V32QImode;
   21621            1 :                 nelt = 32;
   21622            1 :                 target = gen_reg_rtx (vmode);
   21623            1 :                 op0 = gen_lowpart (vmode, op0);
   21624            1 :                 op1 = gen_lowpart (vmode, op1);
   21625            1 :                 goto finish_pblendvb;
   21626              :               }
   21627              : 
   21628              :           /* Use vpblendw.  */
   21629          544 :           for (i = 0; i < 16; ++i)
   21630          512 :             mask |= (d->perm[i] >= 16) << i;
   21631              :           break;
   21632              :         }
   21633              : 
   21634              :       /* Use vpblendd.  */
   21635          153 :       for (i = 0; i < 8; ++i)
   21636          136 :         mask |= (d->perm[i * 2] >= 16) << i;
   21637           17 :       vmode = V8SImode;
   21638           17 :       goto do_subreg;
   21639              : 
   21640              :     case E_V4DImode:
   21641              :       /* Use vpblendd.  */
   21642           45 :       for (i = 0; i < 4; ++i)
   21643           54 :         mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
   21644            9 :       vmode = V8SImode;
   21645            9 :       goto do_subreg;
   21646              : 
   21647            0 :     default:
   21648            0 :       gcc_unreachable ();
   21649              :     }
   21650              : 
   21651         2495 :   switch (vmode)
   21652              :     {
   21653              :     case E_V8DFmode:
   21654              :     case E_V8DImode:
   21655              :       mmode = QImode;
   21656              :       break;
   21657            5 :     case E_V16SFmode:
   21658            5 :     case E_V16SImode:
   21659            5 :       mmode = HImode;
   21660            5 :       break;
   21661            6 :     case E_V32HImode:
   21662            6 :       mmode = SImode;
   21663            6 :       break;
   21664            1 :     case E_V64QImode:
   21665            1 :       mmode = DImode;
   21666            1 :       break;
   21667              :     default:
   21668              :       mmode = VOIDmode;
   21669              :     }
   21670              : 
   21671              :   /* Canonicalize vec_merge.  */
   21672         2495 :   if (swap_commutative_operands_p (op1, op0)
   21673              :       /* Two operands have same precedence, then
   21674              :          first bit of mask select first operand.  */
   21675         2495 :       || (!swap_commutative_operands_p (op0, op1)
   21676         2495 :           && !(mask & 1)))
   21677              :     {
   21678         2488 :       unsigned n_elts = GET_MODE_NUNITS (vmode);
   21679         2488 :       std::swap (op0, op1);
   21680         2488 :       unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
   21681         2488 :       if (n_elts == HOST_BITS_PER_WIDE_INT)
   21682              :         mask_all  = -1;
   21683              :       else
   21684         2487 :         mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
   21685         2488 :       mask = ~mask & mask_all;
   21686              :     }
   21687              : 
   21688         2495 :   if (mmode != VOIDmode)
   21689           20 :     maskop = force_reg (mmode, gen_int_mode (mask, mmode));
   21690              :   else
   21691         2475 :     maskop = GEN_INT (mask);
   21692              : 
   21693              :   /* This matches five different patterns with the different modes.  */
   21694         2495 :   x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
   21695         2495 :   x = gen_rtx_SET (target, x);
   21696         2495 :   emit_insn (x);
   21697         2495 :   if (target != d->target)
   21698          931 :     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   21699              : 
   21700              :   return true;
   21701              : }
   21702              : 
   21703              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21704              :    in terms of the variable form of vpermilps.
   21705              : 
   21706              :    Note that we will have already failed the immediate input vpermilps,
   21707              :    which requires that the high and low part shuffle be identical; the
   21708              :    variable form doesn't require that.
                      : 
                      :    Only one-operand V8SImode/V8SFmode permutations on TARGET_AVX are
                      :    handled, and only when every index stays within its own half of
                      :    the vector; return false otherwise.  When D->testing_p is set,
                      :    return true without emitting any insn.  Otherwise load the
                      :    selector into a V8SImode register, emit the V8SFmode vpermilvar
                      :    pattern (viewing a V8SImode target and operand as V8SFmode) and
                      :    return true.  */
   21709              : 
   21710              : static bool
   21711       135971 : expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
   21712              : {
   21713       135971 :   rtx rperm[8], vperm;
   21714       135971 :   unsigned i;
   21715              : 
   21716       135971 :   if (!TARGET_AVX || !d->one_operand_p
   21717        11182 :       || (d->vmode != V8SImode && d->vmode != V8SFmode))
   21718              :     return false;
   21719              : 
   21720              :   /* We can only permute within the 128-bit lane: result elements 0-3
                      :      must select source elements 0-3, and result elements 4-7 must
                      :      select source elements numbered 4 or above.  */
   21721        16201 :   for (i = 0; i < 8; ++i)
   21722              :     {
   21723        15551 :       unsigned e = d->perm[i];
   21724        15551 :       if (i < 4 ? e >= 4 : e < 4)
   21725              :         return false;
   21726              :     }
   21727              : 
   21728          650 :   if (d->testing_p)
   21729              :     return true;
   21730              : 
   21731          657 :   for (i = 0; i < 8; ++i)
   21732              :     {
   21733          584 :       unsigned e = d->perm[i];
   21734              : 
   21735              :       /* Within each 128-bit lane, the elements of op0 are numbered
   21736              :          from 0 and the elements of op1 are numbered from 4.
                      : 
                      :          NOTE(review): D is one-operand here and the loop above has
                      :          already rejected cross-lane indices, so this appears to just
                      :          reduce E to a lane-relative index in 0..3; the E >= 12 case
                      :          looks unreachable as long as indices are below 8 -- confirm.  */
   21737          584 :       if (e >= 8 + 4)
   21738            0 :         e -= 8;
   21739          584 :       else if (e >= 4)
   21740          292 :         e -= 4;
   21741              : 
   21742          584 :       rperm[i] = GEN_INT (e);
   21743              :     }
   21744              : 
   21745           73 :   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
   21746           73 :   vperm = force_reg (V8SImode, vperm);
   21747           73 :   rtx target = d->target;
   21748           73 :   rtx op0 = d->op0;
   21749           73 :   if (d->vmode == V8SImode)
   21750              :     {
   21751           21 :       target = lowpart_subreg (V8SFmode, target, V8SImode);
   21752           21 :       op0 = lowpart_subreg (V8SFmode, op0, V8SImode);
   21753              :     }
   21754              : 
   21755           73 :   emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm));
   21756              : 
   21757           73 :   return true;
   21758              : }
   21759              : 
   21760              : /* For V*[QHS]Imode permutations, check whether the same permutation
   21761              :    can be performed in a 2x, 4x or 8x wider inner mode.
                      : 
                      :    D is the permutation to examine and ND receives the widened form;
                      :    the function calls itself as (ND, ND) to keep widening in place.
                      :    Return false, leaving ND untouched, if D->vmode is not one of the
                      :    modes listed below or if the indices do not form aligned pairs (an
                      :    even index at each even position, followed by that index plus
                      :    one).  Otherwise halve the element count, store the pair indices
                      :    in ND and recurse unless the new elements are DImode.  The result
                      :    of the recursive call is ignored, so a failed deeper step keeps
                      :    the widening reached so far, and true is returned.
                      : 
                      :    When ND != D the remaining fields are filled in as well: the
                      :    one_operand_p and testing_p flags are copied, the operands become
                      :    lowparts of D's operands in the new mode, and the target is a
                      :    fresh pseudo or, when D->testing_p is set, a raw REG numbered
                      :    LAST_VIRTUAL_REGISTER + 1.  */
   21762              : 
   21763              : static bool
   21764       159340 : canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
   21765              :                               struct expand_vec_perm_d *nd)
   21766              : {
   21767       159340 :   int i;
   21768       159340 :   machine_mode mode = VOIDmode;
   21769              : 
   21770       159340 :   switch (d->vmode)
   21771              :     {
   21772              :     case E_V8QImode: mode = V4HImode; break;
   21773        29404 :     case E_V16QImode: mode = V8HImode; break;
   21774          715 :     case E_V32QImode: mode = V16HImode; break;
   21775          275 :     case E_V64QImode: mode = V32HImode; break;
   21776        11971 :     case E_V4HImode: mode = V2SImode; break;
   21777        20468 :     case E_V8HImode: mode = V4SImode; break;
   21778         1006 :     case E_V16HImode: mode = V8SImode; break;
   21779          397 :     case E_V32HImode: mode = V16SImode; break;
   21780        40471 :     case E_V4SImode: mode = V2DImode; break;
   21781         1485 :     case E_V8SImode: mode = V4DImode; break;
   21782           65 :     case E_V16SImode: mode = V8DImode; break;
   21783              :     default: return false;
   21784              :     }
   21785       200982 :   for (i = 0; i < d->nelt; i += 2)
   21786       186644 :     if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
   21787              :       return false;
   21788        14338 :   nd->vmode = mode;
   21789        14338 :   nd->nelt = d->nelt / 2;
   21790        93646 :   for (i = 0; i < nd->nelt; i++)
   21791        79308 :     nd->perm[i] = d->perm[2 * i] / 2;
   21792        28676 :   if (GET_MODE_INNER (mode) != DImode)
   21793        12606 :     canonicalize_vector_int_perm (nd, nd);
   21794        14338 :   if (nd != d)
   21795              :     {
   21796         9095 :       nd->one_operand_p = d->one_operand_p;
   21797         9095 :       nd->testing_p = d->testing_p;
   21798         9095 :       if (d->op0 == d->op1)
   21799         3031 :         nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
   21800              :       else
   21801              :         {
   21802         6064 :           nd->op0 = gen_lowpart (nd->vmode, d->op0);
   21803         6064 :           nd->op1 = gen_lowpart (nd->vmode, d->op1);
   21804              :         }
   21805         9095 :       if (d->testing_p)
   21806         5832 :         nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
   21807              :       else
   21808         3263 :         nd->target = gen_reg_rtx (nd->vmode);
   21809              :     }
   21810              :   return true;
   21811              : }
   21812              : 
   21813              : /* Return true if permutation D can be performed as VMODE permutation
   21814              :    instead.
                      : 
                      :    Both VMODE and D->vmode must be integer vector modes of the same
                      :    size.  If VMODE has at least as many elements as D the answer is
                      :    trivially true.  Otherwise each VMODE element covers CHUNK
                      :    consecutive elements of D, and every such group of indices must
                      :    start at a multiple of CHUNK and be consecutive.  The alignment
                      :    test masks with CHUNK - 1, which presumably relies on CHUNK being
                      :    a power of two.  */
   21815              : 
   21816              : static bool
   21817         5994 : valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
   21818              : {
   21819         5994 :   unsigned int i, j, chunk;
   21820              : 
   21821         5994 :   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
   21822         5994 :       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
   21823        14742 :       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
   21824              :     return false;
   21825              : 
   21826         8748 :   if (GET_MODE_NUNITS (vmode) >= d->nelt)
   21827              :     return true;
   21828              : 
   21829         4086 :   chunk = d->nelt / GET_MODE_NUNITS (vmode);
   21830         5328 :   for (i = 0; i < d->nelt; i += chunk)
   21831         5081 :     if (d->perm[i] & (chunk - 1))
   21832              :       return false;
   21833              :     else
   21834         7759 :       for (j = 1; j < chunk; ++j)
   21835         6517 :         if (d->perm[i] + j != d->perm[i + j])
   21836              :           return false;
   21837              : 
   21838              :   return true;
   21839              : }
   21840              : 
   21841              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21842              :    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.
                      : 
                      :    Two-operand permutations of 4, 8 or 16 bytes require TARGET_XOP and
                      :    are emitted through the pperm patterns; 32-byte ones require
                      :    TARGET_AVX2 and are handled only when the permutation is valid in
                      :    V2TImode (emitted via gen_avx2_permv2ti).  One-operand
                      :    permutations require TARGET_SSSE3 for 4, 8 and 16 bytes,
                      :    TARGET_AVX2 for 32 bytes and TARGET_AVX512BW for 64 bytes.  Any
                      :    other size fails.
                      : 
                      :    Return true if D can be handled this way, false otherwise.  When
                      :    D->testing_p is set, nothing is emitted.  */
   21843              : 
   21844              : static bool
   21845       135321 : expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   21846              : {
   21847       135321 :   unsigned i, nelt, eltsz, mask;
   21848       135321 :   unsigned char perm[64];
   21849       135321 :   machine_mode vmode;
   21850       135321 :   struct expand_vec_perm_d nd;
   21851       135321 :   rtx rperm[64], vperm, target, op0, op1;
   21852              : 
   21853       135321 :   nelt = d->nelt;
   21854              : 
   21855       135321 :   if (!d->one_operand_p)
   21856       220736 :     switch (GET_MODE_SIZE (d->vmode))
   21857              :       {
   21858         8329 :       case 4:
   21859         8329 :         if (!TARGET_XOP)
   21860              :           return false;
   21861              :         vmode = V4QImode;
   21862              :         break;
   21863              : 
   21864        19530 :       case 8:
   21865        19530 :         if (!TARGET_XOP)
   21866              :           return false;
   21867              :         vmode = V8QImode;
   21868              :         break;
   21869              : 
   21870        73595 :       case 16:
   21871        73595 :         if (!TARGET_XOP)
   21872              :           return false;
   21873              :         vmode = V16QImode;
   21874              :         break;
   21875              : 
   21876         8004 :       case 32:
   21877         8004 :         if (!TARGET_AVX2)
   21878              :           return false;
   21879              : 
   21880         4014 :         if (valid_perm_using_mode_p (V2TImode, d))
   21881              :           {
   21882           56 :             if (d->testing_p)
   21883              :               return true;
   21884              : 
   21885              :             /* Use vperm2i128 insn.  The pattern uses
   21886              :                V4DImode instead of V2TImode.  The immediate built below
                      :                is the half-vector number (index divided by nelt / 2)
                      :                selected for the low result half, ORed with 16 times
                      :                the half-vector number selected for the high result
                      :                half.  */
   21887           52 :             target = d->target;
   21888           52 :             if (d->vmode != V4DImode)
   21889           12 :               target = gen_reg_rtx (V4DImode);
   21890           52 :             op0 = gen_lowpart (V4DImode, d->op0);
   21891           52 :             op1 = gen_lowpart (V4DImode, d->op1);
   21892           52 :             rperm[0]
   21893           52 :               = GEN_INT ((d->perm[0] / (nelt / 2))
   21894              :                          | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
   21895           52 :             emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
   21896           52 :             if (target != d->target)
   21897           12 :               emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   21898           52 :             return true;
   21899              :           }
   21900              :         /* FALLTHRU */
   21901              : 
   21902              :       default:
   21903              :         return false;
   21904              :       }
   21905              :   else
   21906        49906 :     switch (GET_MODE_SIZE (d->vmode))
   21907              :       {
   21908         3534 :       case 4:
   21909         3534 :         if (!TARGET_SSSE3)
   21910              :           return false;
   21911              :         vmode = V4QImode;
   21912              :         break;
   21913              : 
   21914         2430 :       case 8:
   21915         2430 :         if (!TARGET_SSSE3)
   21916              :           return false;
   21917              :         vmode = V8QImode;
   21918              :         break;
   21919              : 
   21920        14040 :       case 16:
   21921        14040 :         if (!TARGET_SSSE3)
   21922              :           return false;
   21923              :         vmode = V16QImode;
   21924              :         break;
   21925              : 
   21926         4560 :       case 32:
   21927         4560 :         if (!TARGET_AVX2)
   21928              :           return false;
   21929              : 
   21930              :         /* V4DImode should be already handled through
   21931              :            expand_vselect by vpermq instruction.  */
   21932         1995 :         gcc_assert (d->vmode != V4DImode);
   21933              : 
   21934         1995 :         vmode = V32QImode;
   21935         1995 :         if (d->vmode == V8SImode
   21936         1606 :             || d->vmode == V16HImode
   21937         1390 :             || d->vmode == V32QImode)
   21938              :           {
   21939              :             /* First see if vpermq can be used for
   21940              :                V8SImode/V16HImode/V32QImode.  PERM[i] becomes the index
                      :                of the 64-bit source element feeding result quarter I.  */
   21941          903 :             if (valid_perm_using_mode_p (V4DImode, d))
   21942              :               {
   21943          770 :                 for (i = 0; i < 4; i++)
   21944          616 :                   perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
   21945          154 :                 if (d->testing_p)
   21946              :                   return true;
   21947           58 :                 target = gen_reg_rtx (V4DImode);
   21948           58 :                 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
   21949              :                                     perm, 4, false))
   21950              :                   {
   21951          116 :                     emit_move_insn (d->target,
   21952           58 :                                     gen_lowpart (d->vmode, target));
   21953           58 :                     return true;
   21954              :                   }
   21955              :                 return false;
   21956              :               }
   21957              : 
   21958              :             /* Next see if vpermd can be used.  */
   21959          749 :             if (valid_perm_using_mode_p (V8SImode, d))
   21960              :               vmode = V8SImode;
   21961              :           }
   21962              :         /* Or if vpermps can be used.  */
   21963         1092 :         else if (d->vmode == V8SFmode)
   21964              :           vmode = V8SImode;
   21965              : 
   21966              :         if (vmode == V32QImode)
   21967              :           {
   21968              :             /* vpshufb only works intra lanes, it is not
   21969              :                possible to shuffle bytes in between the lanes.  Reject
                      :                any index whose nelt / 2 bit differs from that of the
                      :                position it is stored to.  */
   21970         6473 :             for (i = 0; i < nelt; ++i)
   21971         6291 :               if ((d->perm[i] ^ i) & (nelt / 2))
   21972              :                 return false;
   21973              :           }
   21974              :         break;
   21975              : 
   21976          389 :       case 64:
   21977          389 :         if (!TARGET_AVX512BW)
   21978              :           return false;
   21979              : 
   21980              :         /* If vpermq didn't work, vpshufb won't work either.  */
   21981          204 :         if (d->vmode == V8DFmode || d->vmode == V8DImode)
   21982              :           return false;
   21983              : 
   21984          175 :         vmode = V64QImode;
   21985          175 :         if (d->vmode == V16SImode
   21986          150 :             || d->vmode == V32HImode
   21987           50 :             || d->vmode == V64QImode)
   21988              :           {
   21989              :             /* First see if vpermq can be used for
   21990              :                V16SImode/V32HImode/V64QImode.  PERM[i] becomes the index
                      :                of the 64-bit source element feeding result eighth I.  */
   21991          164 :             if (valid_perm_using_mode_p (V8DImode, d))
   21992              :               {
   21993            0 :                 for (i = 0; i < 8; i++)
   21994            0 :                   perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
   21995            0 :                 if (d->testing_p)
   21996              :                   return true;
   21997            0 :                 target = gen_reg_rtx (V8DImode);
   21998            0 :                 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
   21999              :                                     perm, 8, false))
   22000              :                   {
   22001            0 :                     emit_move_insn (d->target,
   22002            0 :                                     gen_lowpart (d->vmode, target));
   22003            0 :                     return true;
   22004              :                   }
   22005              :                 return false;
   22006              :               }
   22007              : 
   22008              :             /* Next see if vpermd can be used.  */
   22009          164 :             if (valid_perm_using_mode_p (V16SImode, d))
   22010              :               vmode = V16SImode;
   22011              :           }
   22012              :         /* Or if vpermps can be used.  */
   22013           11 :         else if (d->vmode == V16SFmode)
   22014              :           vmode = V16SImode;
   22015              : 
   22016              :         if (vmode == V64QImode)
   22017              :           {
   22018              :             /* vpshufb only works intra lanes, it is not
   22019              :                possible to shuffle bytes in between the lanes.  Here
                      :                the mask 3 * nelt / 4 covers the nelt / 2 and nelt / 4
                      :                index bits, so an index is rejected when it lies in a
                      :                different quarter of the vector than its position.  */
   22020          578 :             for (i = 0; i < nelt; ++i)
   22021          578 :               if ((d->perm[i] ^ i) & (3 * nelt / 4))
   22022              :                 return false;
   22023              :           }
   22024              :         break;
   22025              : 
   22026              :       default:
   22027              :         return false;
   22028              :       }
   22029              : 
   22030        11646 :   if (d->testing_p)
   22031              :     return true;
   22032              : 
   22033              :   /* Try to avoid variable permutation instruction.  If D can be
                      :      rewritten with wider elements and that form expands as a single
                      :      insn, use it and copy the result back in D's own mode.  */
   22034         8854 :   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
   22035              :     {
   22036         1839 :       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
   22037         1839 :       return true;
   22038              :     }
   22039              : 
   22040         7015 :   if (vmode == V8SImode)
   22041         9603 :     for (i = 0; i < 8; ++i)
   22042         8536 :       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
   22043         5948 :   else if (vmode == V16SImode)
   22044          612 :     for (i = 0; i < 16; ++i)
   22045          576 :       rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
   22046              :   else
   22047              :     {
   22048         5912 :       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
   22049         5912 :       if (!d->one_operand_p)
   22050         3212 :         mask = 2 * nelt - 1;
   22051         2700 :       else if (vmode == V64QImode)
   22052            0 :         mask = nelt / 4 - 1;
   22053         2700 :       else if (vmode == V32QImode)
   22054          176 :         mask = nelt / 2 - 1;
   22055              :       else
   22056         2524 :         mask = nelt - 1;
   22057              : 
   22058        59020 :       for (i = 0; i < nelt; ++i)
   22059              :         {
   22060        53108 :           unsigned j, e = d->perm[i] & mask;
   22061       148168 :           for (j = 0; j < eltsz; ++j)
   22062        95060 :             rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
   22063              :         }
   22064              :     }
   22065              : 
   22066         7015 :   machine_mode vpmode = vmode;
   22067              : 
   22068         7015 :   nelt = GET_MODE_SIZE (vmode);
   22069              : 
   22070              :   /* Emulate narrow modes with V16QI instructions.  From here on NELT
                      :      is the byte size of VMODE rather than D's element count.  */
   22071         7015 :   if (nelt < 16)
   22072              :     {
   22073          222 :       rtx m128 = GEN_INT (-128);
   22074              : 
   22075              :       /* Remap elements from the second operand, as we have to
   22076              :          account for inactive top elements from the first operand.
                      :          Selector values >= NELT refer to the second operand and are
                      :          moved up by 16 - NELT.  */
   22077          222 :       if (!d->one_operand_p)
   22078              :         {
   22079          243 :           for (i = 0; i < nelt; ++i)
   22080              :             {
   22081          216 :               unsigned ival = UINTVAL (rperm[i]);
   22082          216 :               if (ival >= nelt)
   22083          108 :                 rperm[i] = GEN_INT (ival + 16 - nelt);
   22084              :             }
   22085              :         }
   22086              : 
   22087              :       /* Fill inactive elements in the top positions with zeros.  The
                      :          selector value stored for them is -128; presumably its set
                      :          top bit is what makes the instruction produce zero -- confirm
                      :          against the pshufb/vpperm documentation.  */
   22088         2570 :       for (i = nelt; i < 16; ++i)
   22089         2348 :         rperm[i] = m128;
   22090              : 
   22091              :       vpmode = V16QImode;
   22092              :     }
   22093              : 
   22094        14030 :   vperm = gen_rtx_CONST_VECTOR (vpmode,
   22095         7015 :                                 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
   22096         7015 :   vperm = force_reg (vpmode, vperm);
   22097              : 
   22098         7015 :   if (vmode == d->vmode)
   22099         2422 :     target = d->target;
   22100              :   else
   22101         4593 :     target = gen_reg_rtx (vmode);
   22102              : 
   22103         7015 :   op0 = gen_lowpart (vmode, d->op0);
   22104              : 
   22105         7015 :   if (d->one_operand_p)
   22106              :     {
   22107         3803 :       rtx (*gen) (rtx, rtx, rtx);
   22108              : 
   22109         3803 :       if (vmode == V4QImode)
   22110              :         gen = gen_mmx_pshufbv4qi3;
   22111              :       else if (vmode == V8QImode)
   22112              :         gen = gen_mmx_pshufbv8qi3;
   22113              :       else if (vmode == V16QImode)
   22114              :         gen = gen_ssse3_pshufbv16qi3;
   22115              :       else if (vmode == V32QImode)
   22116              :         gen = gen_avx2_pshufbv32qi3;
   22117              :       else if (vmode == V64QImode)
   22118              :         gen = gen_avx512bw_pshufbv64qi3;
   22119              :       else if (vmode == V8SFmode)
   22120              :         gen = gen_avx2_permvarv8sf;
   22121              :       else if (vmode == V8SImode)
   22122              :         gen = gen_avx2_permvarv8si;
   22123              :       else if (vmode == V16SFmode)
   22124              :         gen = gen_avx512f_permvarv16sf;
   22125              :       else if (vmode == V16SImode)
   22126              :         gen = gen_avx512f_permvarv16si;
   22127              :       else
   22128              :         gcc_unreachable ();
   22129              : 
   22130         3803 :       emit_insn (gen (target, op0, vperm));
   22131              :     }
   22132              :   else
   22133              :     {
   22134         3212 :       rtx (*gen) (rtx, rtx, rtx, rtx);
   22135              : 
   22136         3212 :       op1 = gen_lowpart (vmode, d->op1);
   22137              : 
   22138         3212 :       if (vmode == V4QImode)
   22139              :         gen = gen_mmx_ppermv32;
   22140              :       else if (vmode == V8QImode)
   22141              :         gen = gen_mmx_ppermv64;
   22142              :       else if (vmode == V16QImode)
   22143              :         gen = gen_xop_pperm;
   22144              :       else
   22145            0 :         gcc_unreachable ();
   22146              : 
   22147         3212 :       emit_insn (gen (target, op0, op1, vperm));
   22148              :     }
   22149              : 
   22150         7015 :   if (target != d->target)
   22151         4593 :     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   22152              : 
   22153              :   return true;
   22154              : }
   22155              : 
   22156              : /* Try to expand one-operand permutation with constant mask.
                      : 
                      :    D->op0 and D->op1 must be rtx_equal_p and TARGET_AVX512F must be
                      :    enabled.  Modes smaller than 64 bytes additionally need
                      :    TARGET_AVX512VL, 2-byte elements need TARGET_AVX512BW and 1-byte
                      :    elements need TARGET_AVX512VBMI.  The mode is taken from D->op0;
                      :    for the floating-point modes the index vector is built in the
                      :    integer vector mode with the same element count.
                      : 
                      :    Return true if a permvar pattern is selected for the mode
                      :    (emitting it unless D->testing_p is set), false otherwise.  */
   22157              : 
   22158              : static bool
   22159       123341 : ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
   22160              : {
   22161       123341 :   machine_mode mode = GET_MODE (d->op0);
   22162       123341 :   machine_mode maskmode = mode;
   22163       246682 :   unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
   22164       123341 :   rtx (*gen) (rtx, rtx, rtx) = NULL;
   22165       123341 :   rtx target, op0, mask;
   22166       123341 :   rtx vec[64];
   22167              : 
   22168       123341 :   if (!rtx_equal_p (d->op0, d->op1))
   22169              :     return false;
   22170              : 
   22171        17510 :   if (!TARGET_AVX512F)
   22172              :     return false;
   22173              : 
   22174              :   /* Accept VNxHImode and VNxQImode now.  Without AVX512VL only
                      :      64-byte modes are handled.  */
   22175          719 :   if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
   22176              :     return false;
   22177              : 
   22178              :   /* vpermw: permuting 2-byte elements needs AVX512BW.  */
   22179          453 :   if (!TARGET_AVX512BW && inner_size == 2)
   22180              :     return false;
   22181              : 
   22182              :   /* vpermb: permuting 1-byte elements needs AVX512VBMI.  */
   22183          319 :   if (!TARGET_AVX512VBMI && inner_size == 1)
   22184              :     return false;
   22185              : 
   22186          200 :   switch (mode)
   22187              :     {
   22188              :     case E_V16SImode:
   22189              :       gen = gen_avx512f_permvarv16si;
   22190              :       break;
   22191            4 :     case E_V16SFmode:
   22192            4 :       gen = gen_avx512f_permvarv16sf;
   22193            4 :       maskmode = V16SImode;
   22194            4 :       break;
   22195            1 :     case E_V8DImode:
   22196            1 :       gen = gen_avx512f_permvarv8di;
   22197            1 :       break;
   22198           30 :     case E_V8DFmode:
   22199           30 :       gen = gen_avx512f_permvarv8df;
   22200           30 :       maskmode = V8DImode;
   22201           30 :       break;
   22202          106 :     case E_V32HImode:
   22203          106 :       gen = gen_avx512bw_permvarv32hi;
   22204          106 :       break;
   22205           14 :     case E_V16HImode:
   22206           14 :       gen = gen_avx512vl_permvarv16hi;
   22207           14 :       break;
   22208            6 :     case E_V8HImode:
   22209            6 :       gen = gen_avx512vl_permvarv8hi;
   22210            6 :       break;
   22211            4 :     case E_V64QImode:
   22212            4 :       gen = gen_avx512bw_permvarv64qi;
   22213            4 :       break;
   22214            2 :     case E_V32QImode:
   22215            2 :       gen = gen_avx512vl_permvarv32qi;
   22216            2 :       break;
   22217            0 :     case E_V16QImode:
   22218            0 :       gen = gen_avx512vl_permvarv16qi;
   22219            0 :       break;
   22220              : 
   22221              :     default:
   22222              :       return false;
   22223              :     }
   22224              : 
   22225          199 :   if (d->testing_p)
   22226              :     return true;
   22227              : 
   22228          190 :   target = d->target;
   22229          190 :   op0 = d->op0;
   22230         4854 :   for (int i = 0; i < d->nelt; ++i)
   22231         4664 :     vec[i] = GEN_INT (d->perm[i]);
   22232          190 :   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
   22233          190 :   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
   22234          190 :   return true;
   22235              : }
   22236              : 
   22237              : static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
   22238              : 
   22239              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   22240              :    in a single instruction.  */
   22241              : 
   22242              : static bool
   22243       378583 : expand_vec_perm_1 (struct expand_vec_perm_d *d)
   22244              : {
   22245       378583 :   unsigned i, nelt = d->nelt;
   22246       378583 :   struct expand_vec_perm_d nd;
   22247              : 
   22248              :   /* Check plain VEC_SELECT first, because AVX has instructions that could
   22249              :      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
   22250              :      input where SEL+CONCAT may not.  */
   22251       378583 :   if (d->one_operand_p)
   22252              :     {
   22253              :       int mask = nelt - 1;
   22254              :       bool identity_perm = true;
   22255              :       bool broadcast_perm = true;
   22256              : 
   22257       505443 :       for (i = 0; i < nelt; i++)
   22258              :         {
   22259       443464 :           nd.perm[i] = d->perm[i] & mask;
   22260       443464 :           if (nd.perm[i] != i)
   22261       340007 :             identity_perm = false;
   22262       443464 :           if (nd.perm[i])
   22263       365782 :             broadcast_perm = false;
   22264              :         }
   22265              : 
   22266        61979 :       if (identity_perm)
   22267              :         {
   22268           11 :           if (!d->testing_p)
   22269            5 :             emit_move_insn (d->target, d->op0);
   22270           11 :           return true;
   22271              :         }
   22272        61968 :       else if (broadcast_perm && TARGET_AVX2)
   22273              :         {
   22274              :           /* Use vpbroadcast{b,w,d}.  */
   22275          397 :           rtx (*gen) (rtx, rtx) = NULL;
   22276          397 :           switch (d->vmode)
   22277              :             {
   22278            1 :             case E_V64QImode:
   22279            1 :               if (TARGET_AVX512BW)
   22280              :                 gen = gen_avx512bw_vec_dupv64qi_1;
   22281              :               break;
   22282            4 :             case E_V32QImode:
   22283            4 :               gen = gen_avx2_pbroadcastv32qi_1;
   22284            4 :               break;
   22285            1 :             case E_V32HImode:
   22286            1 :               if (TARGET_AVX512BW)
   22287              :                 gen = gen_avx512bw_vec_dupv32hi_1;
   22288              :               break;
   22289            4 :             case E_V16HImode:
   22290            4 :               gen = gen_avx2_pbroadcastv16hi_1;
   22291            4 :               break;
   22292            1 :             case E_V16SImode:
   22293            1 :               if (TARGET_AVX512F)
   22294              :                 gen = gen_avx512f_vec_dupv16si_1;
   22295              :               break;
   22296            4 :             case E_V8SImode:
   22297            4 :               gen = gen_avx2_pbroadcastv8si_1;
   22298            4 :               break;
   22299            4 :             case E_V16QImode:
   22300            4 :               gen = gen_avx2_pbroadcastv16qi;
   22301            4 :               break;
   22302            5 :             case E_V8HImode:
   22303            5 :               gen = gen_avx2_pbroadcastv8hi;
   22304            5 :               break;
   22305            0 :             case E_V16SFmode:
   22306            0 :               if (TARGET_AVX512F)
   22307              :                 gen = gen_avx512f_vec_dupv16sf_1;
   22308              :               break;
   22309              :             case E_V8SFmode:
   22310              :               gen = gen_avx2_vec_dupv8sf_1;
   22311              :               break;
   22312            0 :             case E_V8DFmode:
   22313            0 :               if (TARGET_AVX512F)
   22314              :                 gen = gen_avx512f_vec_dupv8df_1;
   22315              :               break;
   22316            0 :             case E_V8DImode:
   22317            0 :               if (TARGET_AVX512F)
   22318              :                 gen = gen_avx512f_vec_dupv8di_1;
   22319              :               break;
   22320              :             /* For other modes prefer other shuffles this function creates.  */
   22321              :             default: break;
   22322              :             }
   22323           21 :           if (gen != NULL)
   22324              :             {
   22325           24 :               if (!d->testing_p)
   22326           24 :                 emit_insn (gen (d->target, d->op0));
   22327           24 :               return true;
   22328              :             }
   22329              :         }
   22330              : 
   22331        61944 :       if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
   22332              :         return true;
   22333              : 
   22334              :       /* There are plenty of patterns in sse.md that are written for
   22335              :          SEL+CONCAT and are not replicated for a single op.  Perhaps
   22336              :          that should be changed, to avoid the nastiness here.  */
   22337              : 
   22338              :       /* Recognize interleave style patterns, which means incrementing
   22339              :          every other permutation operand.  */
   22340       199619 :       for (i = 0; i < nelt; i += 2)
   22341              :         {
   22342       163300 :           nd.perm[i] = d->perm[i] & mask;
   22343       163300 :           nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
   22344              :         }
   22345        36319 :       if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
   22346        36319 :                                   d->testing_p))
   22347              :         return true;
   22348              : 
   22349              :       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
   22350        31365 :       if (nelt >= 4)
   22351              :         {
   22352       107157 :           for (i = 0; i < nelt; i += 4)
   22353              :             {
   22354        75792 :               nd.perm[i + 0] = d->perm[i + 0] & mask;
   22355        75792 :               nd.perm[i + 1] = d->perm[i + 1] & mask;
   22356        75792 :               nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
   22357        75792 :               nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
   22358              :             }
   22359              : 
   22360        31365 :           if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
   22361        31365 :                                       d->testing_p))
   22362              :             return true;
   22363              :         }
   22364              :     }
   22365              : 
   22366              :   /* Try the SSE4.1 blend variable merge instructions.  */
   22367       342311 :   if (expand_vec_perm_blend (d))
   22368              :     return true;
   22369              : 
   22370              :   /* Try movss/movsd instructions.  */
   22371       340095 :   if (expand_vec_perm_movs (d))
   22372              :     return true;
   22373              : 
   22374              :   /* Try the SSE4.1 insertps instruction.  */
   22375       284050 :   if (expand_vec_perm_insertps (d))
   22376              :     return true;
   22377              : 
   22378              :   /* Try the fully general two operand permute.  */
   22379       277982 :   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
   22380       277982 :                               d->testing_p))
   22381              :     return true;
   22382              : 
   22383              :   /* Recognize interleave style patterns with reversed operands.  */
   22384       136099 :   if (!d->one_operand_p)
   22385              :     {
   22386       884730 :       for (i = 0; i < nelt; ++i)
   22387              :         {
   22388       774234 :           unsigned e = d->perm[i];
   22389       774234 :           if (e >= nelt)
   22390       380066 :             e -= nelt;
   22391              :           else
   22392       394168 :             e += nelt;
   22393       774234 :           nd.perm[i] = e;
   22394              :         }
   22395              : 
   22396       110496 :       if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
   22397       110496 :                                   d->testing_p))
   22398              :         return true;
   22399              :     }
   22400              : 
   22401              :   /* Try one of the AVX vpermil variable permutations.  */
   22402       135971 :   if (expand_vec_perm_vpermil (d))
   22403              :     return true;
   22404              : 
   22405              :   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
   22406              :      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
   22407       135321 :   if (expand_vec_perm_pshufb (d))
   22408              :     return true;
   22409              : 
   22410              :   /* Try the AVX2 vpalignr instruction.  */
   22411       123465 :   if (expand_vec_perm_palignr (d, true))
   22412              :     return true;
   22413              : 
   22414              :   /* Try the AVX512F vperm{w,b,s,d} instructions  */
   22415       123341 :   if (ix86_expand_vec_one_operand_perm_avx512 (d))
   22416              :     return true;
   22417              : 
   22418              :   /* Try the AVX512F vpermt2/vpermi2 instructions.  */
   22419       123142 :   if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
   22420              :     return true;
   22421              : 
   22422              :   /* See if we can get the same permutation in different vector integer
   22423              :      mode.  */
   22424       122241 :   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
   22425              :     {
   22426         6654 :       if (!d->testing_p)
   22427         1207 :         emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
   22428         6654 :       return true;
   22429              :     }
   22430              :   return false;
   22431              : }
   22432              : 
   22433              : /* Canonicalize the selector of the two-operand permutation D so that
   22434              :    its first index always comes from the first vector.  If D->perm[0]
                      :    is already below D->nelt, D is left untouched.  Otherwise every
                      :    index is moved to the other operand by adding D->nelt modulo
                      :    2 * D->nelt, and D->op0 and D->op1 are swapped to match.  D is
                      :    modified in place.  */
   22435              : static void
   22436         8129 : ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
   22437              : {
   22438         8129 :   unsigned nelt = d->nelt;
   22439         8129 :   if (d->perm[0] < nelt)
   22440              :     return;
   22441              : 
   22442            5 :   for (unsigned i = 0; i != nelt; i++)
   22443            4 :     d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
   22444              : 
   22445            1 :   std::swap (d->op0, d->op1);
   22446            1 :   return;
   22447              : }
   22448              : 
   22449              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   22450              :    in terms of a pair of shufps + shufps/pshufd instructions.
                      :    Only two-operand V4SImode and V4SFmode permutations are handled;
                      :    anything else returns false.  When D->testing_p is set, return
                      :    true for those modes without emitting any insns.  Otherwise emit
                      :    the two shuffles into D->target and return true.  Note that
                      :    D->perm, D->op0 and D->op1 may be modified along the way.  */
   22451              : static bool
   22452        84421 : expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
   22453              : {
   22454        84421 :   unsigned char perm1[4];
   22455        84421 :   machine_mode vmode = d->vmode;
   22456        84421 :   bool ok;
   22457        84421 :   unsigned i, j, k, count = 0;
   22458              : 
   22459        84421 :   if (d->one_operand_p
   22460        79111 :       || (vmode != V4SImode && vmode != V4SFmode))
   22461              :     return false;
   22462              : 
   22463        34754 :   if (d->testing_p)
   22464              :     return true;
   22465              : 
   22466         8129 :   ix86_vec_perm_index_canon (d);
   22467        48774 :   for (i = 0; i < 4; ++i)
   22468        50873 :     count += d->perm[i] > 3 ? 1 : 0;
   22469              : 
   22470         8129 :   gcc_assert (count & 3);  /* COUNT must be 1, 2 or 3.  */
   22471              : 
   22472         8129 :   rtx tmp = gen_reg_rtx (vmode);
   22473              :   /* 2 from op0 and 2 from op1.  */
   22474         8129 :   if (count == 2)
   22475              :     {
   22476              :       unsigned char perm2[4];
   22477        18070 :       for (i = 0, j = 0, k = 2; i < 4; ++i)
   22478        14456 :         if (d->perm[i] & 4)
   22479              :           {
   22480         7228 :             perm1[k++] = d->perm[i];
   22481         7228 :             perm2[i] = k - 1;
   22482              :           }
   22483              :         else
   22484              :           {
   22485         7228 :             perm1[j++] = d->perm[i];
   22486         7228 :             perm2[i] = j - 1;
   22487              :           }
   22488              : 
   22489              :       /* shufps: gather the op0 elements in slots 0-1 of TMP and the
                      :          op1 elements in slots 2-3.  */
   22490         7228 :       ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
   22491         3614 :                                   perm1, d->nelt, false);
   22492         3614 :       gcc_assert (ok);
   22493         3614 :       if (vmode == V4SImode && TARGET_SSE2)
   22494              :       /* pshufd.  */
   22495         2058 :         ok = expand_vselect (d->target, tmp,
   22496         2058 :                              perm2, d->nelt, false);
   22497              :       else
   22498              :         {
   22499              :           /* shufps; elements 2 and 3 select from the second copy of TMP.  */
   22500         1556 :           perm2[2] += 4;
   22501         1556 :           perm2[3] += 4;
   22502         1556 :           ok = expand_vselect_vconcat (d->target, tmp, tmp,
   22503         1556 :                                        perm2, d->nelt, false);
   22504              :         }
   22505         3614 :       gcc_assert (ok);
   22506              :     }
   22507              :   /* 3 from one op and 1 from another.  */
   22508              :   else
   22509              :     {
   22510        22575 :       unsigned pair_idx = 8, lone_idx = 8, shift;
   22511              : 
   22512              :       /* Find the lone index.  */
   22513        22575 :       for (i = 0; i < 4; ++i)
   22514        18060 :         if ((d->perm[i] > 3 && count == 1)
   22515        14753 :             || (d->perm[i] < 4 && count == 3))
   22516        18060 :           lone_idx = i;
   22517              : 
   22518              :       /* When lone_idx is not 0, it must come from the second op (count == 1).  */
   22519         5723 :       gcc_assert (count == (lone_idx ? 1 : 3));
   22520              : 
   22521              :       /* Find the pair index that sits in the same half as the lone index.  */
   22522         4515 :       shift = lone_idx & 2;
   22523         4515 :       pair_idx = 1 - lone_idx + 2 * shift;
   22524              : 
   22525              :       /* First permute the lone and pair elements into the same vector as
   22526              :          [ lone, lone, pair, pair ].  */
   22527         9030 :       perm1[1] = perm1[0]
   22528         4515 :         = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
   22529         9030 :       perm1[3] = perm1[2]
   22530         4515 :         = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
   22531              : 
   22532              :       /* Always put the vector that contains the lone index first.  */
   22533         4515 :       if (count == 1)
   22534         3307 :         std::swap (d->op0, d->op1);
   22535              : 
   22536              :       /* shufps.  */
   22537         9030 :       ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
   22538         4515 :                                    perm1, d->nelt, false);
   22539         4515 :       gcc_assert (ok);
   22540              : 
   22541              :       /* Put the lone and pair elements back in their original order.  */
   22542         4515 :       perm1[shift] = lone_idx << 1;
   22543         4515 :       perm1[shift + 1] = pair_idx << 1;
   22544              : 
   22545              :       /* Select the remaining 2 elements from the other vector.  */
   22546        13545 :       for (i = 2 - shift; i < 4 - shift; ++i)
   22547         9030 :         perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
   22548              : 
   22549              :       /* Adjust to the original selector.  */
   22550         4515 :       if (lone_idx > 1)
   22551         2233 :         std::swap (tmp, d->op1);
   22552              : 
   22553              :       /* shufps.  */
   22554         9030 :       ok = expand_vselect_vconcat (d->target, tmp, d->op1,
   22555         4515 :                                    perm1, d->nelt, false);
   22556              : 
   22557         4515 :       gcc_assert (ok);
   22558              :     }
   22559              : 
   22560              :   return true;
   22561              : }
   22562              : 
   22563              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   22564              :    in terms of a pair of pshuflw + pshufhw instructions.
                      :    This applies only to one-operand V8HImode permutations in which
                      :    elements 0-3 select from indices 0-3 and elements 4-7 select from
                      :    indices 4-7; return false otherwise.  When D->testing_p is set,
                      :    return true without emitting any insns.  */
   22565              : 
   22566              : static bool
   22567       101362 : expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
   22568              : {
   22569       101362 :   unsigned char perm2[MAX_VECT_LEN];
   22570       101362 :   unsigned i;
   22571       101362 :   bool ok;
   22572              : 
   22573       101362 :   if (d->vmode != V8HImode || !d->one_operand_p)
   22574              :     return false;
   22575              : 
   22576              :   /* The two permutations only operate within 64-bit lanes.  */
   22577        12859 :   for (i = 0; i < 4; ++i)
   22578        10382 :     if (d->perm[i] >= 4)
   22579              :       return false;
   22580        12329 :   for (i = 4; i < 8; ++i)
   22581         9866 :     if (d->perm[i] < 4)
   22582              :       return false;
   22583              : 
   22584         2463 :   if (d->testing_p)
   22585              :     return true;
   22586              : 
   22587              :   /* Emit the pshuflw: permute elements 0-3 and keep elements 4-7.  */
   22588          134 :   memcpy (perm2, d->perm, 4);
   22589          670 :   for (i = 4; i < 8; ++i)
   22590          536 :     perm2[i] = i;
   22591          134 :   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
   22592          134 :   gcc_assert (ok);
   22593              : 
   22594              :   /* Emit the pshufhw: permute elements 4-7 of the previous result and
                      :      keep elements 0-3.  */
   22595          134 :   memcpy (perm2 + 4, d->perm + 4, 4);
   22596          670 :   for (i = 0; i < 4; ++i)
   22597          536 :     perm2[i] = i;
   22598          134 :   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
   22599          134 :   gcc_assert (ok);
   22600              : 
   22601              :   return true;
   22602              : }
   22603              : 
   22604              : /* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle.
                      :    D must be a two-operand permutation of a 64-bit vector mode with
                      :    TARGET_MMX_WITH_SSE; V4HImode and V8QImode additionally require
                      :    TARGET_SSSE3.  Return false otherwise.  When D->testing_p is set,
                      :    return true without emitting any insns.  Otherwise concatenate
                      :    D->op0 and D->op1 into one vector of twice the width, expand a
                      :    one-operand permutation on it with expand_vec_perm_1, and copy the
                      :    low part of the result to D->target.  */
   22605              : static bool
   22606        49667 : expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
   22607              : {
   22608        49667 :   if (GET_MODE_BITSIZE (d->vmode) != 64
   22609        15989 :       || !TARGET_MMX_WITH_SSE
   22610        65656 :       || d->one_operand_p)
   22611              :     return false;
   22612              : 
   22613        14579 :   machine_mode widen_vmode;
   22614        14579 :   switch (d->vmode)
   22615              :     {
   22616              :     /* pshufd.  */
   22617              :     case E_V2SImode:
   22618              :       widen_vmode = V4SImode;
   22619              :       break;
   22620              : 
   22621              :     /* pshufd.  */
   22622         1122 :     case E_V2SFmode:
   22623         1122 :       widen_vmode = V4SFmode;
   22624         1122 :       break;
   22625              : 
   22626         4979 :     case E_V4HImode:
   22627         4979 :       widen_vmode = V8HImode;
   22628              :       /* pshufb.  */
   22629         4979 :       if (!TARGET_SSSE3)
   22630              :         return false;
   22631              :       break;
   22632              : 
   22633         5987 :     case E_V8QImode:
   22634              :       /* pshufb.  */
   22635         5987 :       widen_vmode = V16QImode;
   22636         5987 :       if (!TARGET_SSSE3)
   22637              :         return false;
   22638              :       break;
   22639              : 
   22640              :     default:
   22641              :       return false;
   22642              :     }
   22643              : 
   22644         5682 :   if (d->testing_p)
   22645              :     return true;
   22646              : 
   22647          370 :   struct expand_vec_perm_d dperm;
   22648          370 :   dperm.target = gen_reg_rtx (widen_vmode);
   22649          370 :   rtx op0 = gen_reg_rtx (widen_vmode);
   22650          370 :   emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
   22651          370 :   dperm.op0 = op0;
   22652          370 :   dperm.op1 = op0;
   22653          370 :   dperm.vmode = widen_vmode;
   22654          370 :   unsigned nelt = GET_MODE_NUNITS (widen_vmode);
   22655          370 :   dperm.nelt = nelt;
   22656          370 :   dperm.one_operand_p = true;
   22657          370 :   dperm.testing_p = false;
   22658              : 
                      :   /* Only the low half of the widened result is used; the high half
                      :      simply repeats the same selector.  */
   22659         1996 :   for (unsigned i = 0; i != nelt / 2; i++)
   22660              :     {
   22661         1626 :       dperm.perm[i] = d->perm[i];
   22662         1626 :       dperm.perm[i + nelt / 2] = d->perm[i];
   22663              :     }
   22664              : 
                      :   /* Call expand_vec_perm_1 outside of gcc_assert so that the insns
                      :      are emitted even if the assertion does not evaluate its
                      :      argument.  */
   22665          370 :   bool ok = expand_vec_perm_1 (&dperm);
                      :   gcc_assert (ok);
   22666          370 :   emit_move_insn (d->target, lowpart_subreg (d->vmode,
   22667              :                                              dperm.target,
   22668              :                                              dperm.vmode));
   22669          370 :   return true;
   22670              : }
   22671              : 
   22672              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   22673              :    the permutation using the SSSE3 palignr instruction.  This succeeds
   22674              :    when all of the elements in PERM fit within one vector and we merely
   22675              :    need to shift them down so that a single vector permutation has a
   22676              :    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
   22677              :    the vpalignr instruction itself can perform the requested permutation.
                      :    Only 16-byte vectors with TARGET_SSSE3 and 32-byte vectors with
                      :    TARGET_AVX2 are handled.  For two-operand permutations the same
                      :    test is also tried with the operands swapped.  When D->testing_p
                      :    is set, this function emits no insns itself.  Return true on
                      :    success.  */
   22678              : 
   22679              : static bool
   22680       222364 : expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
   22681              : {
   22682       222364 :   unsigned i, nelt = d->nelt;
   22683       222364 :   unsigned min, max, minswap, maxswap;
   22684       222364 :   bool in_order, ok, swap = false;
   22685       222364 :   rtx shift, target;
   22686       222364 :   struct expand_vec_perm_d dcopy;
   22687              : 
   22688              :   /* Even with AVX, palignr only operates on 128-bit vectors;
   22689              :      in AVX2 palignr operates on both 128-bit lanes.  */
   22690       117576 :   if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
   22691       264786 :       && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
   22692              :     return false;
   22693              : 
   22694        34464 :   min = 2 * nelt;
   22695        34464 :   max = 0;
   22696        34464 :   minswap = 2 * nelt;
   22697        34464 :   maxswap = 0;
   22698       239124 :   for (i = 0; i < nelt; ++i)
   22699              :     {
   22700       204660 :       unsigned e = d->perm[i];
   22701       204660 :       unsigned eswap = d->perm[i] ^ nelt;
   22702       409320 :       if (GET_MODE_SIZE (d->vmode) == 32)
   22703              :         {
   22704        70000 :           e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
   22705        70000 :           eswap = e ^ (nelt / 2);
   22706              :         }
   22707       204660 :       if (e < min)
   22708              :         min = e;
   22709       204660 :       if (e > max)
   22710              :         max = e;
   22711       204660 :       if (eswap < minswap)
   22712              :         minswap = eswap;
   22713       204660 :       if (eswap > maxswap)
   22714              :         maxswap = eswap;
   22715              :     }
   22716        34464 :   if (min == 0
   22717        50381 :       || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
   22718              :     {
   22719        31264 :       if (d->one_operand_p
   22720        30995 :           || minswap == 0
   22721        66970 :           || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
   22722        17853 :                                    ? nelt / 2 : nelt))
   22723              :         return false;
   22724              :       swap = true;
   22725              :       min = minswap;
   22726         6420 :       max = maxswap;
   22727              :     }
   22728              : 
   22729              :   /* Given that we have SSSE3, we know we'll be able to implement the
   22730              :      single operand permutation after the palignr with pshufb for
   22731              :      128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
   22732              :      first.  */
   22733         6474 :   if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
   22734              :     return true;
   22735              : 
   22736         6420 :   dcopy = *d;
   22737         6420 :   if (swap)
   22738              :     {
   22739         3220 :       dcopy.op0 = d->op1;
   22740         3220 :       dcopy.op1 = d->op0;
   22741        16172 :       for (i = 0; i < nelt; ++i)
   22742        12952 :         dcopy.perm[i] ^= nelt;
   22743              :     }
   22744              : 
   22745              :   in_order = true;
   22746        32668 :   for (i = 0; i < nelt; ++i)
   22747              :     {
   22748        26248 :       unsigned e = dcopy.perm[i];
   22749        26248 :       if (GET_MODE_SIZE (d->vmode) == 32
   22750         1152 :           && e >= nelt
   22751        26510 :           && (e & (nelt / 2 - 1)) < min)
   22752          262 :         e = e - min - (nelt / 2);
   22753              :       else
   22754        25986 :         e = e - min;
   22755        26248 :       if (e != i)
   22756        19394 :         in_order = false;
   22757        26248 :       dcopy.perm[i] = e;
   22758              :     }
   22759         6420 :   dcopy.one_operand_p = true;
   22760              : 
   22761         6420 :   if (single_insn_only_p && !in_order)
   22762              :     return false;
   22763              : 
   22764              :   /* For AVX2, test whether we can permute the result in one instruction.  */
   22765         3271 :   if (d->testing_p)
   22766              :     {
   22767           54 :       if (in_order)
   22768              :         return true;
   22769            0 :       dcopy.op1 = dcopy.op0;
   22770            0 :       return expand_vec_perm_1 (&dcopy);
   22771              :     }
   22772              : 
   22773         6434 :   shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
   22774         6434 :   if (GET_MODE_SIZE (d->vmode) == 16)
   22775              :     {
   22776         3145 :       target = gen_reg_rtx (V1TImode);
   22777         3145 :       emit_insn (gen_ssse3_palignrv1ti (target,
   22778         3145 :                                         gen_lowpart (V1TImode, dcopy.op1),
   22779         3145 :                                         gen_lowpart (V1TImode, dcopy.op0),
   22780              :                                         shift));
   22781              :     }
   22782              :   else
   22783              :     {
   22784           72 :       target = gen_reg_rtx (V2TImode);
   22785           72 :       emit_insn (gen_avx2_palignrv2ti (target,
   22786           72 :                                        gen_lowpart (V2TImode, dcopy.op1),
   22787           72 :                                        gen_lowpart (V2TImode, dcopy.op0),
   22788              :                                        shift));
   22789              :     }
   22790              : 
   22791         3217 :   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
   22792              : 
   22793              :   /* Test for the degenerate case where the alignment by itself
   22794              :      produces the desired permutation.  */
   22795         3217 :   if (in_order)
   22796              :     {
   22797           70 :       emit_move_insn (d->target, dcopy.op0);
   22798           70 :       return true;
   22799              :     }
   22800              : 
   22801         3147 :   ok = expand_vec_perm_1 (&dcopy);
   22802         3159 :   gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
   22803              : 
   22804              :   return ok;
   22805              : }
   22806              : 
   22807              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   22808              :    the permutation using the SSE4_1 pblendv instruction.  Potentially
   22809              :    reduces the permutation from 2 pshufb plus an or to 1 pshufb and pblendv.
                      :    The permutation is split into a one-operand permutation of the
                      :    operand supplying the out-of-place elements, expanded with
                      :    expand_vec_perm_1, followed by a blend of that result with the
                      :    other operand, expanded with expand_vec_perm_blend.  Return true
                      :    on success; when D->testing_p is set the blend step is skipped.  */
   22810              : 
   22811              : static bool
   22812        89327 : expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
   22813              : {
   22814        89327 :   unsigned i, which, nelt = d->nelt;
   22815        89327 :   struct expand_vec_perm_d dcopy, dcopy1;
   22816        89327 :   machine_mode vmode = d->vmode;
   22817        89327 :   bool ok;
   22818              : 
   22819              :   /* Use the same checks as in expand_vec_perm_blend.  */
   22820        89327 :   if (d->one_operand_p)
   22821              :     return false;
   22822        87867 :   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
   22823              :     ;
   22824        82269 :   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
   22825              :     ;
   22826        78994 :   else if (TARGET_SSE4_1
   22827        89582 :            && (GET_MODE_SIZE (vmode) == 16
   22828         9228 :                || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
   22829         2733 :                || GET_MODE_SIZE (vmode) == 4))
   22830              :     ;
   22831              :   else
   22832              :     return false;
   22833              : 
   22834              :   /* Figure out which permutation elements do not stay in their
   22835              :      respective lanes.  */
   22836       108231 :   for (i = 0, which = 0; i < nelt; ++i)
   22837              :     {
   22838        92964 :       unsigned e = d->perm[i];
   22839        92964 :       if (e != i)
   22840       127952 :         which |= (e < nelt ? 1 : 2);
   22841              :     }
   22842              :   /* We can pblend the part where elements do not stay in their
   22843              :      respective lanes only when these elements all come from one
   22844              :      half of a permutation.
   22845              :      {0 1 8 3 4 5 9 7} is ok as 8, 9 are not at their respective
   22846              :      lanes, but both 8 and 9 >= 8
   22847              :      {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
   22848              :      respective lanes and 8 >= 8, but 2 is not.  */
   22849        15267 :   if (which != 1 && which != 2)
   22850              :     return false;
   22851         3175 :   if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
   22852              :     return true;
   22853              : 
   22854              :   /* First we apply a one-operand permutation to the part where
   22855              :      elements do not stay in their respective lanes.  */
   22856         1958 :   dcopy = *d;
   22857         1958 :   if (which == 2)
   22858         1958 :     dcopy.op0 = dcopy.op1 = d->op1;
   22859              :   else
   22860            0 :     dcopy.op0 = dcopy.op1 = d->op0;
   22861         1958 :   if (!d->testing_p)
   22862          741 :     dcopy.target = gen_reg_rtx (vmode);
   22863         1958 :   dcopy.one_operand_p = true;
   22864              : 
   22865        15762 :   for (i = 0; i < nelt; ++i)
   22866        13804 :     dcopy.perm[i] = d->perm[i] & (nelt - 1);
   22867              : 
   22868         1958 :   ok = expand_vec_perm_1 (&dcopy);
   22869         3916 :   if (GET_MODE_SIZE (vmode) != 16 && !ok)
   22870              :     return false;
   22871              :   else
   22872         1663 :     gcc_assert (ok);
   22873         1663 :   if (d->testing_p)
   22874              :     return true;
   22875              : 
   22876              :   /* Next we put the permuted elements into their positions.  */
   22877          679 :   dcopy1 = *d;
   22878          679 :   if (which == 2)
   22879          679 :     dcopy1.op1 = dcopy.target;
   22880              :   else
   22881            0 :     dcopy1.op0 = dcopy.target;
   22882              : 
   22883         5751 :   for (i = 0; i < nelt; ++i)
   22884         5072 :     dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
   22885              : 
   22886          679 :   ok = expand_vec_perm_blend (&dcopy1);
   22887          679 :   gcc_assert (ok);
   22888              : 
   22889              :   return true;
   22890              : }
   22891              : 
   22892              : static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
   22893              : 
   22894              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   22895              :    a two vector permutation into a single vector permutation by using
   22896              :    an interleave operation to merge the vectors.  */
   22897              : 
   22898              : static bool
   22899        95764 : expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   22900              : {
   22901        95764 :   struct expand_vec_perm_d dremap, dfinal;
   22902        95764 :   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
   22903        95764 :   unsigned HOST_WIDE_INT contents;
   22904        95764 :   unsigned char remap[2 * MAX_VECT_LEN];
   22905        95764 :   rtx_insn *seq;
   22906        95764 :   bool ok, same_halves = false;
   22907              : 
   22908        95764 :   if (GET_MODE_SIZE (d->vmode) == 4
   22909       171582 :       || GET_MODE_SIZE (d->vmode) == 8
   22910       231626 :       || GET_MODE_SIZE (d->vmode) == 16)
   22911              :     {
   22912        89367 :       if (d->one_operand_p)
   22913              :         return false;
   22914              :     }
   22915        12794 :   else if (GET_MODE_SIZE (d->vmode) == 32)
   22916              :     {
   22917         6072 :       if (!TARGET_AVX)
   22918              :         return false;
   22919              :       /* For 32-byte modes allow even d->one_operand_p.
   22920              :          The lack of cross-lane shuffling in some instructions
   22921              :          might prevent a single insn shuffle.  */
   22922         6072 :       dfinal = *d;
   22923         6072 :       dfinal.testing_p = true;
   22924              :       /* If expand_vec_perm_interleave3 can expand this into
   22925              :          a 3 insn sequence, give up and let it be expanded as
   22926              :          3 insn sequence.  While that is one insn longer,
   22927              :          it doesn't need a memory operand and in the common
   22928              :          case that both interleave low and high permutations
   22929              :          with the same operands are adjacent needs 4 insns
   22930              :          for both after CSE.  */
   22931         6072 :       if (expand_vec_perm_interleave3 (&dfinal))
   22932              :         return false;
   22933              :     }
   22934              :   else
   22935              :     return false;
   22936              : 
   22937              :   /* Examine from whence the elements come.  */
   22938        89991 :   contents = 0;
   22939       675225 :   for (i = 0; i < nelt; ++i)
   22940       585234 :     contents |= HOST_WIDE_INT_1U << d->perm[i];
   22941              : 
   22942        89991 :   memset (remap, 0xff, sizeof (remap));
   22943        89991 :   dremap = *d;
   22944              : 
   22945        89991 :   if (GET_MODE_SIZE (d->vmode) == 4
   22946       171689 :       || GET_MODE_SIZE (d->vmode) == 8)
   22947              :     {
   22948        24739 :       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   22949              : 
   22950              :       /* Split the two input vectors into 4 halves.  */
   22951        24739 :       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   22952        24739 :       h2 = h1 << nelt2;
   22953        24739 :       h3 = h2 << nelt2;
   22954        24739 :       h4 = h3 << nelt2;
   22955              : 
   22956              :       /* If the elements from the low halves use interleave low,
   22957              :          and similarly for interleave high.  */
   22958        24739 :       if ((contents & (h1 | h3)) == contents)
   22959              :         {
   22960              :           /* punpckl* */
   22961         3348 :           for (i = 0; i < nelt2; ++i)
   22962              :             {
   22963         2360 :               remap[i] = i * 2;
   22964         2360 :               remap[i + nelt] = i * 2 + 1;
   22965         2360 :               dremap.perm[i * 2] = i;
   22966         2360 :               dremap.perm[i * 2 + 1] = i + nelt;
   22967              :             }
   22968              :         }
   22969        23751 :       else if ((contents & (h2 | h4)) == contents)
   22970              :         {
   22971              :           /* punpckh* */
   22972         2877 :           for (i = 0; i < nelt2; ++i)
   22973              :             {
   22974         2028 :               remap[i + nelt2] = i * 2;
   22975         2028 :               remap[i + nelt + nelt2] = i * 2 + 1;
   22976         2028 :               dremap.perm[i * 2] = i + nelt2;
   22977         2028 :               dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   22978              :             }
   22979              :         }
   22980              :       else
   22981              :         return false;
   22982              :     }
   22983       130504 :   else if (GET_MODE_SIZE (d->vmode) == 16)
   22984              :     {
   22985        59398 :       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   22986              : 
   22987              :       /* Split the two input vectors into 4 halves.  */
   22988        59398 :       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   22989        59398 :       h2 = h1 << nelt2;
   22990        59398 :       h3 = h2 << nelt2;
   22991        59398 :       h4 = h3 << nelt2;
   22992              : 
   22993              :       /* If the elements from the low halves use interleave low, and similarly
   22994              :          for interleave high.  If the elements are from mis-matched halves, we
   22995              :          can use shufps for V4SF/V4SI or do a DImode shuffle.  */
   22996        59398 :       if ((contents & (h1 | h3)) == contents)
   22997              :         {
   22998              :           /* punpckl* */
   22999         5779 :           for (i = 0; i < nelt2; ++i)
   23000              :             {
   23001         4286 :               remap[i] = i * 2;
   23002         4286 :               remap[i + nelt] = i * 2 + 1;
   23003         4286 :               dremap.perm[i * 2] = i;
   23004         4286 :               dremap.perm[i * 2 + 1] = i + nelt;
   23005              :             }
   23006         1493 :           if (!TARGET_SSE2 && d->vmode == V4SImode)
   23007            0 :             dremap.vmode = V4SFmode;
   23008              :         }
   23009        57905 :       else if ((contents & (h2 | h4)) == contents)
   23010              :         {
   23011              :           /* punpckh* */
   23012         4986 :           for (i = 0; i < nelt2; ++i)
   23013              :             {
   23014         3666 :               remap[i + nelt2] = i * 2;
   23015         3666 :               remap[i + nelt + nelt2] = i * 2 + 1;
   23016         3666 :               dremap.perm[i * 2] = i + nelt2;
   23017         3666 :               dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   23018              :             }
   23019         1320 :           if (!TARGET_SSE2 && d->vmode == V4SImode)
   23020            0 :             dremap.vmode = V4SFmode;
   23021              :         }
   23022        56585 :       else if ((contents & (h1 | h4)) == contents)
   23023              :         {
   23024              :           /* shufps */
   23025         2537 :           for (i = 0; i < nelt2; ++i)
   23026              :             {
   23027         1828 :               remap[i] = i;
   23028         1828 :               remap[i + nelt + nelt2] = i + nelt2;
   23029         1828 :               dremap.perm[i] = i;
   23030         1828 :               dremap.perm[i + nelt2] = i + nelt + nelt2;
   23031              :             }
   23032          709 :           if (nelt != 4)
   23033              :             {
   23034              :               /* shufpd */
   23035           69 :               dremap.vmode = V2DImode;
   23036           69 :               dremap.nelt = 2;
   23037           69 :               dremap.perm[0] = 0;
   23038           69 :               dremap.perm[1] = 3;
   23039              :             }
   23040              :         }
   23041        55876 :       else if ((contents & (h2 | h3)) == contents)
   23042              :         {
   23043              :           /* shufps */
   23044         3483 :           for (i = 0; i < nelt2; ++i)
   23045              :             {
   23046         2458 :               remap[i + nelt2] = i;
   23047         2458 :               remap[i + nelt] = i + nelt2;
   23048         2458 :               dremap.perm[i] = i + nelt2;
   23049         2458 :               dremap.perm[i + nelt2] = i + nelt;
   23050              :             }
   23051         1025 :           if (nelt != 4)
   23052              :             {
   23053              :               /* shufpd */
   23054           76 :               dremap.vmode = V2DImode;
   23055           76 :               dremap.nelt = 2;
   23056           76 :               dremap.perm[0] = 1;
   23057           76 :               dremap.perm[1] = 2;
   23058              :             }
   23059              :         }
   23060              :       else
   23061              :         return false;
   23062              :     }
   23063              :   else
   23064              :     {
   23065         5854 :       unsigned int nelt4 = nelt / 4, nzcnt = 0;
   23066         5854 :       unsigned HOST_WIDE_INT q[8];
   23067         5854 :       unsigned int nonzero_halves[4];
   23068              : 
   23069              :       /* Split the two input vectors into 8 quarters.  */
   23070         5854 :       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
   23071        46832 :       for (i = 1; i < 8; ++i)
   23072        40978 :         q[i] = q[0] << (nelt4 * i);
   23073        29270 :       for (i = 0; i < 4; ++i)
   23074        23416 :         if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
   23075              :           {
   23076        21169 :             nonzero_halves[nzcnt] = i;
   23077        21169 :             ++nzcnt;
   23078              :           }
   23079              : 
   23080         5854 :       if (nzcnt == 1)
   23081              :         {
   23082          215 :           gcc_assert (d->one_operand_p);
   23083          215 :           nonzero_halves[1] = nonzero_halves[0];
   23084          215 :           same_halves = true;
   23085              :         }
   23086         5639 :       else if (d->one_operand_p)
   23087              :         {
   23088           23 :           gcc_assert (nonzero_halves[0] == 0);
   23089           23 :           gcc_assert (nonzero_halves[1] == 1);
   23090              :         }
   23091              : 
   23092         5854 :       if (nzcnt <= 2)
   23093              :         {
   23094          490 :           if (d->perm[0] / nelt2 == nonzero_halves[1])
   23095              :             {
   23096              :               /* Attempt to increase the likelihood that dfinal
   23097              :                  shuffle will be intra-lane.  */
   23098          223 :               std::swap (nonzero_halves[0], nonzero_halves[1]);
   23099              :             }
   23100              : 
   23101              :           /* vperm2f128 or vperm2i128.  */
   23102         3256 :           for (i = 0; i < nelt2; ++i)
   23103              :             {
   23104         2766 :               remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
   23105         2766 :               remap[i + nonzero_halves[0] * nelt2] = i;
   23106         2766 :               dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
   23107         2766 :               dremap.perm[i] = i + nonzero_halves[0] * nelt2;
   23108              :             }
   23109              : 
   23110          490 :           if (d->vmode != V8SFmode
   23111              :               && d->vmode != V4DFmode
   23112              :               && d->vmode != V8SImode)
   23113              :             {
   23114          132 :               dremap.vmode = V8SImode;
   23115          132 :               dremap.nelt = 8;
   23116          660 :               for (i = 0; i < 4; ++i)
   23117              :                 {
   23118          528 :                   dremap.perm[i] = i + nonzero_halves[0] * 4;
   23119          528 :                   dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
   23120              :                 }
   23121              :             }
   23122              :         }
   23123         5364 :       else if (d->one_operand_p)
   23124         4947 :         return false;
   23125         5364 :       else if (TARGET_AVX2
   23126         2125 :                && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
   23127              :         {
   23128              :           /* vpunpckl* */
   23129          443 :           for (i = 0; i < nelt4; ++i)
   23130              :             {
   23131          223 :               remap[i] = i * 2;
   23132          223 :               remap[i + nelt] = i * 2 + 1;
   23133          223 :               remap[i + nelt2] = i * 2 + nelt2;
   23134          223 :               remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
   23135          223 :               dremap.perm[i * 2] = i;
   23136          223 :               dremap.perm[i * 2 + 1] = i + nelt;
   23137          223 :               dremap.perm[i * 2 + nelt2] = i + nelt2;
   23138          223 :               dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
   23139              :             }
   23140              :         }
   23141         5144 :       else if (TARGET_AVX2
   23142         1905 :                && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
   23143              :         {
   23144              :           /* vpunpckh* */
   23145          397 :           for (i = 0; i < nelt4; ++i)
   23146              :             {
   23147          200 :               remap[i + nelt4] = i * 2;
   23148          200 :               remap[i + nelt + nelt4] = i * 2 + 1;
   23149          200 :               remap[i + nelt2 + nelt4] = i * 2 + nelt2;
   23150          200 :               remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
   23151          200 :               dremap.perm[i * 2] = i + nelt4;
   23152          200 :               dremap.perm[i * 2 + 1] = i + nelt + nelt4;
   23153          200 :               dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
   23154          200 :               dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
   23155              :             }
   23156              :         }
   23157              :       else
   23158              :         return false;
   23159              :     }
   23160              : 
   23161              :   /* Use the remapping array set up above to move the elements from their
   23162              :      swizzled locations into their final destinations.  */
   23163         7291 :   dfinal = *d;
   23164        47767 :   for (i = 0; i < nelt; ++i)
   23165              :     {
   23166        40476 :       unsigned e = remap[d->perm[i]];
   23167        40476 :       gcc_assert (e < nelt);
   23168              :       /* If same_halves is true, both halves of the remapped vector are the
   23169              :          same.  Avoid cross-lane accesses if possible.  */
   23170        40476 :       if (same_halves && i >= nelt2)
   23171              :         {
   23172          792 :           gcc_assert (e < nelt2);
   23173          792 :           dfinal.perm[i] = e + nelt2;
   23174              :         }
   23175              :       else
   23176        39684 :         dfinal.perm[i] = e;
   23177              :     }
   23178         7291 :   if (!d->testing_p)
   23179              :     {
   23180         2703 :       dremap.target = gen_reg_rtx (dremap.vmode);
   23181         2703 :       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
   23182              :     }
   23183         7291 :   dfinal.op1 = dfinal.op0;
   23184         7291 :   dfinal.one_operand_p = true;
   23185              : 
   23186              :   /* Test if the final remap can be done with a single insn.  For V4SFmode or
   23187              :      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
   23188         7291 :   start_sequence ();
   23189         7291 :   ok = expand_vec_perm_1 (&dfinal);
   23190         7291 :   seq = end_sequence ();
   23191              : 
   23192         7291 :   if (!ok)
   23193              :     return false;
   23194              : 
   23195         6269 :   if (d->testing_p)
   23196              :     return true;
   23197              : 
   23198         2664 :   if (dremap.vmode != dfinal.vmode)
   23199              :     {
   23200           55 :       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
   23201           55 :       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
   23202              :     }
   23203              : 
   23204         2664 :   ok = expand_vec_perm_1 (&dremap);
   23205         2664 :   gcc_assert (ok);
   23206              : 
   23207         2664 :   emit_insn (seq);
   23208         2664 :   return true;
   23209              : }
   23210              : 
    23211              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    23212              :    a single vector cross-lane permutation into vpermq followed
    23213              :    by any of the single insn permutations.  */
                         /* Only attempted for one-operand V32QImode/V16HImode permutations
                            with TARGET_AVX2.  The input is viewed as four quarters of NELT4
                            elements each; the transformation applies when each half of the
                            result draws from at most two of those quarters.  DREMAP
                            (V4DImode) gathers the quarters needed by each result half into
                            that half, and DFINAL then permutes DREMAP's result in D's own
                            mode, each result half reading only from the same half.
                            Returns true on success (nothing is emitted when D->testing_p),
                            false if the permutation does not have this shape.  */
    23214              : 
    23215              : static bool
    23216        89395 : expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
    23217              : {
    23218        89395 :   struct expand_vec_perm_d dremap, dfinal;
    23219        89395 :   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
    23220        89395 :   unsigned contents[2];
    23221        89395 :   bool ok;
    23222              : 
    23223        89395 :   if (!(TARGET_AVX2
    23224         4080 :         && (d->vmode == V32QImode || d->vmode == V16HImode)
    23225          256 :         && d->one_operand_p))
    23226              :     return false;
    23227              : 
                           /* contents[0] / contents[1] is a bitmask of the quarters referenced
                              by the low / high half of the result.  */
    23228            7 :   contents[0] = 0;
    23229            7 :   contents[1] = 0;
    23230          103 :   for (i = 0; i < nelt2; ++i)
    23231              :     {
    23232           96 :       contents[0] |= 1u << (d->perm[i] / nelt4);
    23233           96 :       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    23234              :     }
    23235              : 
                           /* Give up if either result half needs more than two quarters.  */
    23236            7 :   for (i = 0; i < 2; ++i)
    23237              :     {
    23238              :       unsigned int cnt = 0;
    23239           21 :       for (j = 0; j < 4; ++j)
    23240           21 :         if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
    23241              :           return false;
    23242              :     }
    23243              : 
    23244            0 :   if (d->testing_p)
    23245              :     return true;
    23246              : 
    23247            0 :   dremap = *d;
    23248            0 :   dremap.vmode = V4DImode;
    23249            0 :   dremap.nelt = 4;
    23250            0 :   dremap.target = gen_reg_rtx (V4DImode);
    23251            0 :   dremap.op0 = gen_lowpart (V4DImode, d->op0);
    23252            0 :   dremap.op1 = dremap.op0;
    23253            0 :   dremap.one_operand_p = true;
                           /* For result half I, list the quarters it needs in increasing order
                              in dremap.perm[2 * I] and dremap.perm[2 * I + 1]; a slot that is
                              not needed is filled with quarter 0.  */
    23254            0 :   for (i = 0; i < 2; ++i)
    23255              :     {
    23256              :       unsigned int cnt = 0;
    23257            0 :       for (j = 0; j < 4; ++j)
    23258            0 :         if ((contents[i] & (1u << j)) != 0)
    23259            0 :           dremap.perm[2 * i + cnt++] = j;
    23260            0 :       for (; cnt < 2; ++cnt)
    23261            0 :         dremap.perm[2 * i + cnt] = 0;
    23262              :     }
    23263              : 
    23264            0 :   dfinal = *d;
    23265            0 :   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    23266            0 :   dfinal.op1 = dfinal.op0;
    23267            0 :   dfinal.one_operand_p = true;
                           /* J is the base index into dremap.perm for the half containing I:
                              0 for the low half, 2 for the high half.  */
    23268            0 :   for (i = 0, j = 0; i < nelt; ++i)
    23269              :     {
    23270            0 :       if (i == nelt2)
    23271            0 :         j = 2;
                               /* Keep the offset within the quarter and stay in the same half
                                  as I; NELT4 is added below when the source quarter is the
                                  second one DREMAP placed in this half.  */
    23272            0 :       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
    23273            0 :       if ((d->perm[i] / nelt4) == dremap.perm[j])
    23274              :         ;
    23275            0 :       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
    23276            0 :         dfinal.perm[i] |= nelt4;
    23277              :       else
    23278            0 :         gcc_unreachable ();
    23279              :     }
    23280              : 
    23281            0 :   ok = expand_vec_perm_1 (&dremap);
    23282            0 :   gcc_assert (ok);
    23283              : 
    23284            0 :   ok = expand_vec_perm_1 (&dfinal);
    23285            0 :   gcc_assert (ok);
    23286              : 
    23287              :   return true;
    23288              : }
   23289              : 
   23290              : static bool canonicalize_perm (struct expand_vec_perm_d *d);
   23291              : 
    23292              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
    23293              :    a vector permutation using two instructions, vperm2f128 resp.
    23294              :    vperm2i128 followed by any single in-lane permutation.  */
                         /* Applies only to 32-byte modes with TARGET_AVX (V8SFmode and
                            V4DFmode, or any 32-byte mode when TARGET_AVX2).  Each of the 32
                            values of PERM names one candidate: bits 0-1 and bits 2-3 select
                            which of the four NELT2-element lanes of D->op0:D->op1 the first
                            shuffle places in its low and high lane, and PERM >= 16 selects
                            D->op1 instead of D->op0 as first operand of the second shuffle.
                            The first candidate whose second shuffle is accepted by
                            expand_vec_perm_1 is used.  Returns true on success (nothing is
                            emitted when D->testing_p), false otherwise.  */
    23295              : 
    23296              : static bool
    23297        89395 : expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
    23298              : {
    23299        89395 :   struct expand_vec_perm_d dfirst, dsecond;
    23300        89395 :   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
    23301        89395 :   bool ok;
    23302              : 
    23303        89395 :   if (!TARGET_AVX
    23304        22132 :       || GET_MODE_SIZE (d->vmode) != 32
    23305        94685 :       || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    23306              :     return false;
    23307              : 
                           /* dsecond is only probed while searching, hence testing_p; it is
                              switched back to a real expansion once a candidate is found.  */
    23308         5106 :   dsecond = *d;
    23309         5106 :   dsecond.one_operand_p = false;
    23310         5106 :   dsecond.testing_p = true;
    23311              : 
    23312              :   /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
    23313              :      immediate.  For perm < 16 the second permutation uses
    23314              :      d->op0 as first operand, for perm >= 16 it uses d->op1
    23315              :      as first operand.  The second operand is the result of
    23316              :      vperm2[fi]128.  */
    23317       167184 :   for (perm = 0; perm < 32; perm++)
    23318              :     {
    23319              :       /* Ignore permutations which do not move anything cross-lane.  */
    23320       162161 :       if (perm < 16)
    23321              :         {
    23322              :           /* The second shuffle for e.g. V4DFmode has
    23323              :              0123 and ABCD operands.
    23324              :              Ignore AB23, as 23 is already in the second lane
    23325              :              of the first operand.  */
    23326        81334 :           if ((perm & 0xc) == (1 << 2)) continue;
    23327              :           /* And 01CD, as 01 is in the first lane of the first
    23328              :              operand.  */
    23329        60986 :           if ((perm & 3) == 0) continue;
    23330              :           /* And 4567, as then the vperm2[fi]128 doesn't change
    23331              :              anything on the original 4567 second operand.  */
    23332        45722 :           if ((perm & 0xf) == ((3 << 2) | 2)) continue;
    23333              :         }
    23334              :       else
    23335              :         {
    23336              :           /* The second shuffle for e.g. V4DFmode has
    23337              :              4567 and ABCD operands.
    23338              :              Ignore AB67, as 67 is already in the second lane
    23339              :              of the first operand.  */
    23340        80827 :           if ((perm & 0xc) == (3 << 2)) continue;
    23341              :           /* And 45CD, as 45 is in the first lane of the first
    23342              :              operand.  */
    23343        60735 :           if ((perm & 3) == 2) continue;
    23344              :           /* And 0123, as then the vperm2[fi]128 doesn't change
    23345              :              anything on the original 0123 first operand.  */
    23346        45575 :           if ((perm & 0xf) == (1 << 2)) continue;
    23347              :         }
    23348              : 
                               /* Build the second shuffle.  J is the source lane of element I.
                                  If it is the lane PERM puts in I's half, the element is read
                                  from the vperm2[fi]128 result (second operand, index >= NELT)
                                  at the same offset within the lane.  If it is already the
                                  lane in I's half of the chosen first operand, it is read from
                                  there.  Any other element makes this candidate unusable.  */
    23349       210202 :       for (i = 0; i < nelt; i++)
    23350              :         {
    23351       209051 :           j = d->perm[i] / nelt2;
    23352       388758 :           if (j == ((perm >> (2 * (i >= nelt2))) & 3))
    23353        51827 :             dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
    23354       260841 :           else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
    23355        77218 :             dsecond.perm[i] = d->perm[i] & (nelt - 1);
    23356              :           else
    23357              :             break;
    23358              :         }
    23359              : 
                               /* Every element fit: probe the second shuffle with
                                  expand_vec_perm_1 inside a sequence whose contents are
                                  discarded.  */
    23360        81157 :       if (i == nelt)
    23361              :         {
    23362         1151 :           start_sequence ();
    23363         1151 :           ok = expand_vec_perm_1 (&dsecond);
    23364         1151 :           end_sequence ();
    23365              :         }
    23366              :       else
    23367              :         ok = false;
    23368              : 
    23369         1151 :       if (ok)
    23370              :         {
    23371           68 :           if (d->testing_p)
    23372              :             return true;
    23373              : 
    23374              :           /* Found a usable second shuffle.  dfirst will be
    23375              :              vperm2f128 on d->op0 and d->op1.  */
    23376           46 :           dsecond.testing_p = false;
    23377           46 :           dfirst = *d;
    23378           46 :           dfirst.target = gen_reg_rtx (d->vmode);
                                   /* dfirst copies whole lanes: element I keeps its offset
                                      within the lane and comes from the lane PERM selects
                                      for I's half.  */
    23379          270 :           for (i = 0; i < nelt; i++)
    23380          448 :             dfirst.perm[i] = (i & (nelt2 - 1))
    23381          336 :                              + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
    23382              : 
    23383           46 :           canonicalize_perm (&dfirst);
    23384           46 :           ok = expand_vec_perm_1 (&dfirst);
    23385           46 :           gcc_assert (ok);
    23386              : 
    23387              :           /* And dsecond is some single insn shuffle, taking
    23388              :              d->op0 and result of vperm2f128 (if perm < 16) or
    23389              :              d->op1 and result of vperm2f128 (otherwise).  */
    23390           46 :           if (perm >= 16)
    23391           46 :             dsecond.op0 = dsecond.op1;
    23392           46 :           dsecond.op1 = dfirst.target;
    23393              : 
    23394           46 :           ok = expand_vec_perm_1 (&dsecond);
    23395           46 :           gcc_assert (ok);
    23396              : 
    23397              :           return true;
    23398              :         }
    23399              : 
    23400              :       /* For one operand, the only useful vperm2f128 permutation is 0x01
    23401              :          aka lanes swap.  */
                               /* PERM == 1 is the first candidate that is not skipped above, so
                                  for a one-operand permutation the search stops after it.  */
    23402        81089 :       if (d->one_operand_p)
    23403              :         return false;
    23404              :     }
    23405              : 
    23406              :   return false;
    23407              : }
   23408              : 
    23409              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    23410              :    a two vector permutation using 2 intra-lane interleave insns
    23411              :    and cross-lane shuffle for 32-byte vectors.  */
                         /* Accepts only two-operand permutations in a 32-byte mode with
                            TARGET_AVX2, or in V8SFmode/V4DFmode with TARGET_AVX, whose
                            selector is exactly a full-width interleave: starting at element
                            0 (low) or NELT / 2 (high), result elements alternate between the
                            first and the second operand.  On a match the corresponding
                            gen_vec_interleave_{low,high}<mode> generator is invoked (nothing
                            is emitted when D->testing_p) and true is returned; otherwise
                            false.  NOTE(review): the insns that generator expands to are not
                            visible in this function; the count above comes from the
                            original comment.  */
    23412              : 
    23413              : static bool
    23414        33409 : expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
    23415              : {
    23416        33409 :   unsigned i, nelt;
    23417        33409 :   rtx (*gen) (rtx, rtx, rtx);
    23418              : 
    23419        33409 :   if (d->one_operand_p)
    23420              :     return false;
    23421        31267 :   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
    23422              :     ;
    23423        24701 :   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
    23424              :     ;
    23425              :   else
    23426              :     return false;
    23427              : 
                           /* The interleave has to start at element 0 or at the middle of the
                              first operand.  */
    23428         8198 :   nelt = d->nelt;
    23429         8198 :   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
    23430              :     return false;
                           /* Each even/odd pair must be element perm[0] + I / 2 of the first
                              operand followed by the same element of the second operand.  */
    23431         8555 :   for (i = 0; i < nelt; i += 2)
    23432         8199 :     if (d->perm[i] != d->perm[0] + i / 2
    23433         7326 :         || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
    23434              :       return false;
    23435              : 
    23436          356 :   if (d->testing_p)
    23437              :     return true;
    23438              : 
                           /* Nonzero perm[0] selects the interleave-high generator, zero the
                              interleave-low one.  */
    23439           56 :   switch (d->vmode)
    23440              :     {
    23441           32 :     case E_V32QImode:
    23442           32 :       if (d->perm[0])
    23443              :         gen = gen_vec_interleave_highv32qi;
    23444              :       else
    23445           16 :         gen = gen_vec_interleave_lowv32qi;
    23446              :       break;
    23447           18 :     case E_V16HImode:
    23448           18 :       if (d->perm[0])
    23449              :         gen = gen_vec_interleave_highv16hi;
    23450              :       else
    23451            9 :         gen = gen_vec_interleave_lowv16hi;
    23452              :       break;
    23453            0 :     case E_V8SImode:
    23454            0 :       if (d->perm[0])
    23455              :         gen = gen_vec_interleave_highv8si;
    23456              :       else
    23457            0 :         gen = gen_vec_interleave_lowv8si;
    23458              :       break;
    23459            4 :     case E_V4DImode:
    23460            4 :       if (d->perm[0])
    23461              :         gen = gen_vec_interleave_highv4di;
    23462              :       else
    23463            2 :         gen = gen_vec_interleave_lowv4di;
    23464              :       break;
    23465            2 :     case E_V8SFmode:
    23466            2 :       if (d->perm[0])
    23467              :         gen = gen_vec_interleave_highv8sf;
    23468              :       else
    23469            1 :         gen = gen_vec_interleave_lowv8sf;
    23470              :       break;
    23471            0 :     case E_V4DFmode:
    23472            0 :       if (d->perm[0])
    23473              :         gen = gen_vec_interleave_highv4df;
    23474              :       else
    23475            0 :         gen = gen_vec_interleave_lowv4df;
    23476              :       break;
    23477            0 :     default:
    23478            0 :       gcc_unreachable ();
    23479              :     }
    23480              : 
    23481           56 :   emit_insn (gen (d->target, d->op0, d->op1));
    23482           56 :   return true;
    23483              : }
   23484              : 
    23485              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
    23486              :    a single vector permutation using a single intra-lane vector
    23487              :    permutation, vperm2f128 swapping the lanes and vblend* insn blending
    23488              :    the non-swapped and swapped vectors together.  */
                         /* Only attempted for one-operand V8SFmode/V4DFmode permutations
                            with TARGET_AVX but without TARGET_AVX2.  DFIRST moves every
                            requested element to the wanted position within the half it
                            already occupies, DSECOND is DFIRST's result with the two halves
                            exchanged, and MSK has bit I set for each result element whose
                            source lies in the other half than I.  Returns false if two
                            result elements need different source elements in the same
                            DFIRST slot or if expand_vec_perm_1 rejects DFIRST; returns true
                            otherwise (nothing is emitted when D->testing_p).
                            NOTE(review): presumably a set bit in MSK makes the blend take
                            the element from DSECOND's result -- confirm against the
                            avx_blendps256/avx_blendpd256 patterns.  */
    23489              : 
    23490              : static bool
    23491        27199 : expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
    23492              : {
    23493        27199 :   struct expand_vec_perm_d dfirst, dsecond;
    23494        27199 :   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
    23495        27199 :   rtx_insn *seq;
    23496        27199 :   bool ok;
    23497        27199 :   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
    23498              : 
    23499        27199 :   if (!TARGET_AVX
    23500         2933 :       || TARGET_AVX2
    23501         1814 :       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
    23502         1630 :       || !d->one_operand_p)
    23503              :     return false;
    23504              : 
                           /* 0xff marks a DFIRST slot that has not been assigned yet.  */
    23505            0 :   dfirst = *d;
    23506            0 :   for (i = 0; i < nelt; i++)
    23507            0 :     dfirst.perm[i] = 0xff;
    23508            0 :   for (i = 0, msk = 0; i < nelt; i++)
    23509              :     {
                               /* J is position I with its NELT2 bit forced to match the one of
                                  the source index D->perm[I], i.e. I moved into the half that
                                  holds the source element.  */
    23510            0 :       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
    23511            0 :       if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
    23512              :         return false;
    23513            0 :       dfirst.perm[j] = d->perm[i];
    23514            0 :       if (j != i)
    23515            0 :         msk |= (1 << i);
    23516              :     }
                           /* Slots nobody asked for keep their own element.  */
    23517            0 :   for (i = 0; i < nelt; i++)
    23518            0 :     if (dfirst.perm[i] == 0xff)
    23519            0 :       dfirst.perm[i] = i;
    23520              : 
    23521            0 :   if (!d->testing_p)
    23522            0 :     dfirst.target = gen_reg_rtx (dfirst.vmode);
    23523              : 
                           /* Expand DFIRST into a held-back sequence so that nothing is emitted
                              if it fails or if this is only a test.  */
    23524            0 :   start_sequence ();
    23525            0 :   ok = expand_vec_perm_1 (&dfirst);
    23526            0 :   seq = end_sequence ();
    23527              : 
    23528            0 :   if (!ok)
    23529              :     return false;
    23530              : 
    23531            0 :   if (d->testing_p)
    23532              :     return true;
    23533              : 
    23534            0 :   emit_insn (seq);
    23535              : 
                           /* DSECOND exchanges the two halves of DFIRST's result.  */
    23536            0 :   dsecond = *d;
    23537            0 :   dsecond.op0 = dfirst.target;
    23538            0 :   dsecond.op1 = dfirst.target;
    23539            0 :   dsecond.one_operand_p = true;
    23540            0 :   dsecond.target = gen_reg_rtx (dsecond.vmode);
    23541            0 :   for (i = 0; i < nelt; i++)
    23542            0 :     dsecond.perm[i] = i ^ nelt2;
    23543              : 
    23544            0 :   ok = expand_vec_perm_1 (&dsecond);
    23545            0 :   gcc_assert (ok);
    23546              : 
    23547            0 :   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
    23548            0 :   emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
    23549            0 :   return true;
    23550              : }
   23551              : 
   23552              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   23553              :    a two vector permutation using two single vector permutations and
   23554              :    {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If TWO_INSN, succeed only if one
   23555              :    of dfirst or dsecond is identity permutation.  Return true on success.  */
   23556              : 
   23557              : static bool
   23558       114863 : expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
   23559              : {
   23560       114863 :   unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
   23561       114863 :   struct expand_vec_perm_d dfirst, dsecond, dfinal;
   23562       114863 :   bool ident1 = true, ident2 = true;
   23563              :   /* Only two-operand permutations are handled.  */
   23564       114863 :   if (d->one_operand_p)
   23565              :     return false;
   23566              :   /* Only 16-byte and 32-byte modes, subject to the ISA checks below.  */
   23567       208572 :   if (GET_MODE_SIZE (d->vmode) == 16)
   23568              :     {
   23569        62536 :       if (!TARGET_SSE)
   23570              :         return false;
   23571        62536 :       if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
   23572              :         return false;
   23573              :     }
   23574        83500 :   else if (GET_MODE_SIZE (d->vmode) == 32)
   23575              :     {
   23576         7220 :       if (!TARGET_AVX)
   23577              :         return false;
   23578         7220 :       if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
   23579              :         return false;
   23580              :       lane = nelt2;  /* Interleave separately within each half of the vector.  */
   23581              :     }
   23582              :   else
   23583              :     return false;
   23584              :   /* Even elements must come from the operand supplying element 0, odd ones from the other.  */
   23585       231246 :   for (i = 1; i < nelt; i++)
   23586       198111 :     if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
   23587              :       return false;
   23588              : 
   23589        33135 :   dfirst = *d;
   23590        33135 :   dsecond = *d;
   23591        33135 :   dfinal = *d;
   23592        33135 :   dfirst.op1 = dfirst.op0;
   23593        33135 :   dfirst.one_operand_p = true;
   23594        33135 :   dsecond.op0 = dsecond.op1;
   23595        33135 :   dsecond.one_operand_p = true;
   23596              :   /* Split D->perm: op0 indices go to dfirst, op1 indices (minus NELT) to dsecond, each written to two slots.  */
   23597       218699 :   for (i = 0; i < nelt; i++)
   23598       185564 :     if (d->perm[i] >= nelt)
   23599              :       {
   23600        92782 :         dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
   23601        92782 :         if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
   23602        84306 :           ident2 = false;  /* dsecond is not the identity.  */
   23603        92782 :         dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
   23604        92782 :           = d->perm[i] - nelt;
   23605              :       }
   23606              :     else
   23607              :       {
   23608        92782 :         dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
   23609        92782 :         if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
   23610        75948 :           ident1 = false;  /* dfirst is not the identity.  */
   23611        92782 :         dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
   23612              :       }
   23613              :   /* With TWO_INSN at least one sub-permutation must be the identity.  */
   23614        33135 :   if (two_insn && !ident1 && !ident2)
   23615              :     return false;
   23616              :   /* Allocate temporaries only when really expanding; swap the final inputs if element 0 comes from op1.  */
   23617         3957 :   if (!d->testing_p)
   23618              :     {
   23619          214 :       if (!ident1)
   23620          144 :         dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
   23621          214 :       if (!ident2)
   23622          148 :         dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
   23623          214 :       if (d->perm[0] >= nelt)
   23624            0 :         std::swap (dfinal.op0, dfinal.op1);
   23625              :     }
   23626              : 
   23627         3957 :   bool ok;
   23628         3957 :   rtx_insn *seq1 = NULL, *seq2 = NULL;
   23629              :   /* Expand each non-identity sub-permutation into its own detached sequence.  */
   23630         3957 :   if (!ident1)
   23631              :     {
   23632         2645 :       start_sequence ();
   23633         2645 :       ok = expand_vec_perm_1 (&dfirst);
   23634         2645 :       seq1 = end_sequence ();
   23635              : 
   23636         2645 :       if (!ok)
   23637              :         return false;
   23638              :     }
   23639              : 
   23640         2168 :   if (!ident2)
   23641              :     {
   23642         2074 :       start_sequence ();
   23643         2074 :       ok = expand_vec_perm_1 (&dsecond);
   23644         2074 :       seq2 = end_sequence ();
   23645              : 
   23646         2074 :       if (!ok)
   23647              :         return false;
   23648              :     }
   23649              :   /* When only testing, SEQ1 and SEQ2 are never emitted.  */
   23650          602 :   if (d->testing_p)
   23651              :     return true;
   23652              :   /* Final interleave: even result elements index dfinal.op0, odd ones dfinal.op1 (offset by NELT).  */
   23653          680 :   for (i = 0; i < nelt; i++)
   23654              :     {
   23655          544 :       dfinal.perm[i] = i / 2;
   23656          544 :       if (i >= lane)
   23657            4 :         dfinal.perm[i] += lane / 2;
   23658          544 :       if ((i & 1) != 0)
   23659          272 :         dfinal.perm[i] += nelt;
   23660              :     }
   23661          136 :   emit_insn (seq1);
   23662          136 :   emit_insn (seq2);
   23663          136 :   ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
   23664              :                                dfinal.perm, dfinal.nelt, false);
   23665          136 :   gcc_assert (ok);
   23666              :   return true;
   23667              : }
   23668              : 
   23669              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   23670              :    the permutation using two single vector permutations and the SSE4_1 pblendv
   23671              :    instruction.  If TWO_INSN, succeed only if one of dfirst or dsecond is
   23672              :    identity permutation.  Return true on success.  */
   23673              : 
   23674              : static bool
   23675       114261 : expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   23676              : {
   23677       114261 :   unsigned i, nelt = d->nelt;
   23678       114261 :   struct expand_vec_perm_d dfirst, dsecond, dfinal;
   23679       114261 :   machine_mode vmode = d->vmode;
   23680       114261 :   bool ident1 = true, ident2 = true;
   23681              : 
   23682              :   /* Use the same checks as in expand_vec_perm_blend.  */
   23683       114261 :   if (d->one_operand_p)
   23684              :     return false;
   23685       107653 :   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
   23686              :     ;
   23687       101705 :   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
   23688              :     ;
   23689        97080 :   else if (TARGET_SSE4_1
   23690       106873 :            && (GET_MODE_SIZE (vmode) == 16
   23691         8878 :                || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
   23692         2629 :                || GET_MODE_SIZE (vmode) == 4))
   23693              :     ;
   23694              :   else
   23695              :     return false;
   23696              : 
   23697        15709 :   dfirst = *d;
   23698        15709 :   dsecond = *d;
   23699        15709 :   dfinal = *d;
   23700        15709 :   dfirst.op1 = dfirst.op0;
   23701        15709 :   dfirst.one_operand_p = true;
   23702        15709 :   dsecond.op0 = dsecond.op1;
   23703        15709 :   dsecond.one_operand_p = true;
   23704              :   /* Split D->perm: dfirst keeps op0 indices, dsecond op1 indices; the other one's slot is marked 0xff.  */
   23705       116717 :   for (i = 0; i < nelt; ++i)
   23706       101008 :     if (d->perm[i] >= nelt)
   23707              :       {
   23708        51027 :         dfirst.perm[i] = 0xff;
   23709        51027 :         dsecond.perm[i] = d->perm[i] - nelt;
   23710        51027 :         if (d->perm[i] != i + nelt)
   23711       101008 :           ident2 = false;
   23712              :       }
   23713              :     else
   23714              :       {
   23715        49981 :         dsecond.perm[i] = 0xff;
   23716        49981 :         dfirst.perm[i] = d->perm[i];
   23717        49981 :         if (d->perm[i] != i)
   23718       101008 :           ident1 = false;
   23719              :       }
   23720              :   /* With TWO_INSN at least one sub-permutation must be the identity.  */
   23721        15709 :   if (two_insn && !ident1 && !ident2)
   23722              :     return false;
   23723              : 
   23724              :   /* For now.  Ideally treat 0xff as a wildcard.  Each 0xff slot gets its own index, or for 32-byte modes mirrors the slot NELT/2 away when that slot is used.  */
   23725        44289 :   for (i = 0; i < nelt; ++i)
   23726        38888 :     if (dfirst.perm[i] == 0xff)
   23727              :       {
   23728        20736 :         if (GET_MODE_SIZE (vmode) == 32
   23729        20736 :             && dfirst.perm[i ^ (nelt / 2)] != 0xff)
   23730        11732 :           dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
   23731              :         else
   23732         9004 :           dfirst.perm[i] = i;
   23733              :       }
   23734              :     else
   23735              :       {
   23736        18152 :         if (GET_MODE_SIZE (vmode) == 32
   23737        18152 :             && dsecond.perm[i ^ (nelt / 2)] != 0xff)
   23738         9964 :           dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
   23739              :         else
   23740         8188 :           dsecond.perm[i] = i;
   23741              :       }
   23742              :   /* Allocate temporaries for the non-identity sub-permutations only when really expanding.  */
   23743         5401 :   if (!d->testing_p)
   23744              :     {
   23745         2169 :       if (!ident1)
   23746         2045 :         dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
   23747         2169 :       if (!ident2)
   23748          855 :         dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
   23749              :     }
   23750              : 
   23751         5401 :   bool ok;
   23752         5401 :   rtx_insn *seq1 = NULL, *seq2 = NULL;
   23753              :   /* Expand each non-identity sub-permutation into its own detached sequence.  */
   23754         5401 :   if (!ident1)
   23755              :     {
   23756         4812 :       start_sequence ();
   23757         4812 :       ok = expand_vec_perm_1 (&dfirst);
   23758         4812 :       seq1 = end_sequence ();
   23759              : 
   23760         4812 :       if (!ok)
   23761              :         return false;
   23762              :     }
   23763              : 
   23764         4014 :   if (!ident2)
   23765              :     {
   23766         1133 :       start_sequence ();
   23767         1133 :       ok = expand_vec_perm_1 (&dsecond);
   23768         1133 :       seq2 = end_sequence ();
   23769              : 
   23770         1133 :       if (!ok)
   23771              :         return false;
   23772              :     }
   23773              :   /* When only testing, SEQ1 and SEQ2 are never emitted.  */
   23774         3425 :   if (d->testing_p)
   23775              :     return true;
   23776              :   /* Final blend: element I comes from dfinal.op1 where D selected op1, otherwise from dfinal.op0.  */
   23777        14047 :   for (i = 0; i < nelt; ++i)
   23778        12220 :     dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
   23779              : 
   23780         1827 :   emit_insn (seq1);
   23781         1827 :   emit_insn (seq2);
   23782         1827 :   ok = expand_vec_perm_blend (&dfinal);
   23783         1827 :   gcc_assert (ok);
   23784              :   return true;
   23785              : }
   23786              : 
   23787              : /* A subroutine of ix86_expand_vec_perm_const_1.
   23788              :    Implement a permutation with psrlw, psllw and por.
   23789              :    It handles case:
   23790              :    __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
   23791              :    __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
   23792              : 
   23793              : static bool
   23794        26415 : expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
   23795              : {
   23796        26415 :   unsigned i;
   23797        26415 :   rtx (*gen_shr) (rtx, rtx, rtx);
   23798        26415 :   rtx (*gen_shl) (rtx, rtx, rtx);
   23799        26415 :   rtx (*gen_or) (rtx, rtx, rtx);
   23800        26415 :   machine_mode mode = VOIDmode;
   23801              :   /* Requires SSE2 and a one-operand permutation.  */
   23802        26415 :   if (!TARGET_SSE2 || !d->one_operand_p)
   23803              :     return false;
   23804              :   /* Pick the HImode vector mode and generators used to operate on D->vmode's bytes.  */
   23805         5267 :   switch (d->vmode)
   23806              :     {
   23807         1410 :     case E_V8QImode:
   23808         1410 :       if (!TARGET_MMX_WITH_SSE)
   23809              :         return false;
   23810              :       mode = V4HImode;
   23811              :       gen_shr = gen_lshrv4hi3;
   23812              :       gen_shl = gen_ashlv4hi3;
   23813              :       gen_or = gen_iorv4hi3;
   23814              :       break;
   23815              :     case E_V16QImode:
   23816              :       mode = V8HImode;
   23817              :       gen_shr = gen_lshrv8hi3;
   23818              :       gen_shl = gen_ashlv8hi3;
   23819              :       gen_or = gen_iorv8hi3;
   23820              :       break;
   23821              :     default: return false;
   23822              :     }
   23823              :   /* op0 and op1 must compare equal under rtx_equal_p.  */
   23824         3141 :   if (!rtx_equal_p (d->op0, d->op1))
   23825              :     return false;
   23826              :   /* Every adjacent pair of elements must be swapped.  */
   23827        12181 :   for (i = 0; i < d->nelt; i += 2)
   23828        10743 :     if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
   23829              :       return false;
   23830              : 
   23831         1438 :   if (d->testing_p)
   23832              :     return true;
   23833              :   /* View op0 in MODE, shift one copy right by 8 and another left by 8, then OR them.  */
   23834           26 :   rtx tmp1 = gen_reg_rtx (mode);
   23835           26 :   rtx tmp2 = gen_reg_rtx (mode);
   23836           26 :   rtx op0 = force_reg (d->vmode, d->op0);
   23837              : 
   23838           26 :   emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
   23839           26 :   emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
   23840           26 :   emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
   23841           26 :   emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
   23842           26 :   emit_insn (gen_or (tmp1, tmp1, tmp2));
   23843           26 :   emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
   23844              : 
   23845           26 :   return true;
   23846              : }
   23847              : 
   23848              : /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   23849              :    permutation using two vperm2f128, followed by a vshufpd insn blending
   23850              :    the two vectors together.  */
   23851              : 
   23852              : static bool
   23853        30181 : expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
   23854              : {
   23855        30181 :   struct expand_vec_perm_d dfirst, dsecond, dthird;
   23856        30181 :   bool ok;
   23857              :   /* Only V4DFmode with AVX is handled.  */
   23858        30181 :   if (!TARGET_AVX || (d->vmode != V4DFmode))
   23859              :     return false;
   23860              :   /* In testing mode report success without trying the expansion.  */
   23861         1213 :   if (d->testing_p)
   23862              :     return true;
   23863              : 
   23864          190 :   dfirst = *d;
   23865          190 :   dsecond = *d;
   23866          190 :   dthird = *d;
   23867              :   /* dfirst gathers the aligned element pairs holding result elements 0 and 2, dsecond those for 1 and 3; dthird picks one element of each pair.  */
   23868          190 :   dfirst.perm[0] = (d->perm[0] & ~1);
   23869          190 :   dfirst.perm[1] = (d->perm[0] & ~1) + 1;
   23870          190 :   dfirst.perm[2] = (d->perm[2] & ~1);
   23871          190 :   dfirst.perm[3] = (d->perm[2] & ~1) + 1;
   23872          190 :   dsecond.perm[0] = (d->perm[1] & ~1);
   23873          190 :   dsecond.perm[1] = (d->perm[1] & ~1) + 1;
   23874          190 :   dsecond.perm[2] = (d->perm[3] & ~1);
   23875          190 :   dsecond.perm[3] = (d->perm[3] & ~1) + 1;
   23876          190 :   dthird.perm[0] = (d->perm[0] % 2);
   23877          190 :   dthird.perm[1] = (d->perm[1] % 2) + 4;
   23878          190 :   dthird.perm[2] = (d->perm[2] % 2) + 2;
   23879          190 :   dthird.perm[3] = (d->perm[3] % 2) + 6;
   23880              :   /* dthird combines the two intermediate results as a two-operand permutation.  */
   23881          190 :   dfirst.target = gen_reg_rtx (dfirst.vmode);
   23882          190 :   dsecond.target = gen_reg_rtx (dsecond.vmode);
   23883          190 :   dthird.op0 = dfirst.target;
   23884          190 :   dthird.op1 = dsecond.target;
   23885          190 :   dthird.one_operand_p = false;
   23886              : 
   23887          190 :   canonicalize_perm (&dfirst);
   23888          190 :   canonicalize_perm (&dsecond);
   23889              :   /* All three expansions are expected to succeed; this is asserted below.  */
   23890          190 :   ok = expand_vec_perm_1 (&dfirst)
   23891          190 :        && expand_vec_perm_1 (&dsecond)
   23892          380 :        && expand_vec_perm_1 (&dthird);
   23893              : 
   23894            0 :   gcc_assert (ok);
   23895              : 
   23896              :   return true;
   23897              : }
   23898              : 
   23899              : static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);  /* Forward declaration; called by the function below.  */
   23900              : 
   23901              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   23902              :    a two vector permutation using two intra-lane vector
   23903              :    permutations, vperm2f128 swapping the lanes and vblend* insn blending
   23904              :    the non-swapped and swapped vectors together.  */
   23905              : 
   23906              : static bool
   23907        15639 : expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
   23908              : {
   23909        15639 :   struct expand_vec_perm_d dfirst, dsecond, dthird;
   23910        15639 :   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
   23911        15639 :   rtx_insn *seq1, *seq2;
   23912        15639 :   bool ok;
   23913        15639 :   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
   23914              :   /* Only AVX without AVX2, V8SFmode or V4DFmode, and two-operand permutations.  */
   23915        15639 :   if (!TARGET_AVX
   23916          794 :       || TARGET_AVX2
   23917          530 :       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
   23918          403 :       || d->one_operand_p)
   23919              :     return false;
   23920              :   /* Start with every slot of both sub-permutations marked unused (0xff).  */
   23921          403 :   dfirst = *d;
   23922          403 :   dsecond = *d;
   23923         3627 :   for (i = 0; i < nelt; i++)
   23924              :     {
   23925         3224 :       dfirst.perm[i] = 0xff;
   23926         3224 :       dsecond.perm[i] = 0xff;
   23927              :     }
   23928         3627 :   for (i = 0, msk = 0; i < nelt; i++)
   23929              :     {
   23930         3224 :       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;  /* I moved into the half the source index lies in.  */
   23931         3224 :       if (j == i)
   23932              :         {
   23933         2498 :           dfirst.perm[j] = d->perm[i];  /* Source and destination are in the same half.  */
   23934         4322 :           which1 |= (d->perm[i] < nelt ? 1 : 2);
   23935              :         }
   23936              :       else
   23937              :         {
   23938          726 :           dsecond.perm[j] = d->perm[i];  /* Placed in the other half; moved back by dthird.  */
   23939          726 :           which2 |= (d->perm[i] < nelt ? 1 : 2);
   23940          726 :           msk |= (1U << i);  /* Element I is routed through dsecond.  */
   23941              :         }
   23942              :     }
   23943          403 :   if (msk == 0 || msk == (1U << nelt) - 1)
   23944              :     return false;
   23945              : 
   23946          403 :   if (!d->testing_p)
   23947              :     {
   23948           40 :       dfirst.target = gen_reg_rtx (dfirst.vmode);
   23949           40 :       dsecond.target = gen_reg_rtx (dsecond.vmode);
   23950              :     }
   23951              :   /* Fill unused slots with in-place indices: into op1 if the sub-permutation references only op1, otherwise into op0.  */
   23952         3627 :   for (i = 0; i < nelt; i++)
   23953              :     {
   23954         3224 :       if (dfirst.perm[i] == 0xff)
   23955          726 :         dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
   23956         3224 :       if (dsecond.perm[i] == 0xff)
   23957         2498 :         dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
   23958              :     }
   23959          403 :   canonicalize_perm (&dfirst);
   23960          403 :   start_sequence ();
   23961          403 :   ok = ix86_expand_vec_perm_const_1 (&dfirst);
   23962          403 :   seq1 = end_sequence ();
   23963              : 
   23964          403 :   if (!ok)
   23965              :     return false;
   23966              : 
   23967          403 :   canonicalize_perm (&dsecond);
   23968          403 :   start_sequence ();
   23969          403 :   ok = ix86_expand_vec_perm_const_1 (&dsecond);
   23970          403 :   seq2 = end_sequence ();
   23971              : 
   23972          403 :   if (!ok)
   23973              :     return false;
   23974              :   /* When only testing, SEQ1 and SEQ2 are never emitted.  */
   23975          403 :   if (d->testing_p)
   23976              :     return true;
   23977              : 
   23978           40 :   emit_insn (seq1);
   23979           40 :   emit_insn (seq2);
   23980              :   /* dthird swaps the two halves of dsecond's result.  */
   23981           40 :   dthird = *d;
   23982           40 :   dthird.op0 = dsecond.target;
   23983           40 :   dthird.op1 = dsecond.target;
   23984           40 :   dthird.one_operand_p = true;
   23985           40 :   dthird.target = gen_reg_rtx (dthird.vmode);
   23986          360 :   for (i = 0; i < nelt; i++)
   23987          320 :     dthird.perm[i] = i ^ nelt2;
   23988              : 
   23989           40 :   ok = expand_vec_perm_1 (&dthird);
   23990           40 :   gcc_assert (ok);
   23991              :   /* Blend dfirst's result with the swapped dthird result under MSK.  */
   23992           40 :   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
   23993           40 :   emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
   23994           40 :   return true;
   23995              : }
   23996              : 
   23997              : /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   23998              :    permutation with two pshufb insns and an ior.  We should have already
   23999              :    failed all two instruction sequences.  */
   24000              : 
   24001              : static bool
   24002        28989 : expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
   24003              : {
   24004        28989 :   rtx rperm[2][16], vperm, l, h, op, m128;
   24005        28989 :   unsigned int i, nelt, eltsz;
   24006        28989 :   machine_mode mode;
   24007        28989 :   rtx (*gen) (rtx, rtx, rtx);
   24008              :   /* Requires SSSE3 and a 4-, 8- or 16-byte vector mode.  */
   24009        33469 :   if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
   24010         8870 :                         && GET_MODE_SIZE (d->vmode) != 8
   24011         8830 :                         && GET_MODE_SIZE (d->vmode) != 4))
   24012              :     return false;
   24013         1409 :   gcc_assert (!d->one_operand_p);
   24014              : 
   24015         1409 :   if (d->testing_p)
   24016              :     return true;
   24017              :   /* Pick the QImode vector mode and pshufb generator matching the vector size.  */
   24018          202 :   switch (GET_MODE_SIZE (d->vmode))
   24019              :     {
   24020              :     case 4:
   24021              :       mode = V4QImode;
   24022              :       gen = gen_mmx_pshufbv4qi3;
   24023              :       break;
   24024           20 :     case 8:
   24025           20 :       mode = V8QImode;
   24026           20 :       gen = gen_mmx_pshufbv8qi3;
   24027           20 :       break;
   24028           45 :     case 16:
   24029           45 :       mode = V16QImode;
   24030           45 :       gen = gen_ssse3_pshufbv16qi3;
   24031           45 :       break;
   24032            0 :     default:
   24033            0 :       gcc_unreachable ();
   24034              :     }
   24035              : 
   24036          101 :   nelt = d->nelt;
   24037          101 :   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
   24038              : 
   24039              :   /* Generate two permutation masks.  If the required element is within
   24040              :      the given vector it is shuffled into the proper lane.  If the required
   24041              :      element is in the other vector, force a zero into the lane by setting
   24042              :      bit 7 in the permutation mask.  */
   24043          101 :   m128 = GEN_INT (-128);
   24044         1029 :   for (i = 0; i < nelt; ++i)
   24045              :     {
   24046          928 :       unsigned j, k, e = d->perm[i];
   24047          928 :       unsigned which = (e >= nelt);  /* 0 selects op0's mask, 1 selects op1's.  */
   24048          928 :       if (e >= nelt)
   24049          480 :         e -= nelt;
   24050              :       /* Emit one byte selector per byte of element I.  */
   24051         1952 :       for (j = 0; j < eltsz; ++j)
   24052              :         {
   24053         1024 :           rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
   24054         1024 :           rperm[1-which][i*eltsz + j] = m128;
   24055              :         }
   24056              :       /* Set every mask byte after element I to -128; later iterations overwrite their own bytes.  */
   24057         9024 :       for (k = i*eltsz + j; k < 16; ++k)
   24058         8096 :         rperm[0][k] = rperm[1][k] = m128;
   24059              :     }
   24060              : 
   24061          101 :   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
   24062          101 :   vperm = force_reg (V16QImode, vperm);
   24063              :   /* L = op0 shuffled with the first mask.  */
   24064          101 :   l = gen_reg_rtx (mode);
   24065          101 :   op = gen_lowpart (mode, d->op0);
   24066          101 :   emit_insn (gen (l, op, vperm));
   24067              : 
   24068          101 :   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
   24069          101 :   vperm = force_reg (V16QImode, vperm);
   24070              :   /* H = op1 shuffled with the second mask.  */
   24071          101 :   h = gen_reg_rtx (mode);
   24072          101 :   op = gen_lowpart (mode, d->op1);
   24073          101 :   emit_insn (gen (h, op, vperm));
   24074              :   /* OR L and H; use a temporary in MODE when D->vmode differs from it.  */
   24075          101 :   op = d->target;
   24076          101 :   if (d->vmode != mode)
   24077           22 :     op = gen_reg_rtx (mode);
   24078          101 :   ix86_emit_vec_binop (IOR, mode, op, l, h);
   24079          101 :   if (op != d->target)
   24080           22 :     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
   24081              : 
   24082              :   return true;
   24083              : }
   24084              : 
   24085              : /* Implement arbitrary permutation of one V32QImode or V16HImode operand
   24086              :    with two vpshufb insns, vpermq and vpor.  We should have already failed
   24087              :    all two or three instruction sequences.  */
   24088              : 
   24089              : static bool
   24090        23895 : expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
   24091              : {
   24092        23895 :   rtx rperm[2][32], vperm, l, h, hp, op, m128;
   24093        23895 :   unsigned int i, nelt, eltsz;
   24094              :   /* Requires AVX2, a one-operand permutation, and V32QImode or V16HImode.  */
   24095        23895 :   if (!TARGET_AVX2
   24096          374 :       || !d->one_operand_p
   24097          172 :       || (d->vmode != V32QImode && d->vmode != V16HImode))
   24098              :     return false;
   24099              : 
   24100            7 :   if (d->testing_p)
   24101              :     return true;
   24102              : 
   24103            7 :   nelt = d->nelt;
   24104            7 :   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
   24105              : 
   24106              :   /* Generate two permutation masks.  If the required element is within
   24107              :      the same lane, it is shuffled in.  If the required element is in the
   24108              :      other lane, force a zero by setting bit 7 in the permutation mask.
   24109              :      In the other mask the mask has non-negative elements if element
   24110              :      is requested from the other lane, but also moved to the other lane,
   24111              :      so that the result of vpshufb can have the two V2TImode halves
   24112              :      swapped.  */
   24113            7 :   m128 = GEN_INT (-128);
   24114          199 :   for (i = 0; i < nelt; ++i)
   24115              :     {
   24116          192 :       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
   24117          192 :       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;  /* Nonzero iff source and destination are in different halves.  */
   24118              : 
   24119          416 :       for (j = 0; j < eltsz; ++j)
   24120              :         {
   24121          224 :           rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
   24122          224 :           rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
   24123              :         }
   24124              :     }
   24125              : 
   24126            7 :   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
   24127            7 :   vperm = force_reg (V32QImode, vperm);
   24128              :   /* H = op0 shuffled with the cross-lane mask.  */
   24129            7 :   h = gen_reg_rtx (V32QImode);
   24130            7 :   op = gen_lowpart (V32QImode, d->op0);
   24131            7 :   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
   24132              : 
   24133              :   /* Swap the 128-bit lanes of h into hp.  */
   24134            7 :   hp = gen_reg_rtx (V4DImode);
   24135            7 :   op = gen_lowpart (V4DImode, h);
   24136            7 :   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
   24137              :                                   const1_rtx));
   24138              : 
   24139            7 :   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
   24140            7 :   vperm = force_reg (V32QImode, vperm);
   24141              :   /* L = op0 shuffled with the same-lane mask.  */
   24142            7 :   l = gen_reg_rtx (V32QImode);
   24143            7 :   op = gen_lowpart (V32QImode, d->op0);
   24144            7 :   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
   24145              :   /* OR L with the lane-swapped HP; use a V32QImode temporary when D->vmode differs.  */
   24146            7 :   op = d->target;
   24147            7 :   if (d->vmode != V32QImode)
   24148            2 :     op = gen_reg_rtx (V32QImode);
   24149            7 :   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
   24150            7 :   if (op != d->target)
   24151            2 :     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
   24152              : 
   24153              :   return true;
   24154              : }
   24155              : 
   24156              : /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   24157              :    and extract-odd permutations of two V32QImode or V16HImode operands
   24158              :    with two vpshufb insns, vpor and vpermq.  We should have already
   24159              :    failed all two or three instruction sequences.  */
   24160              : 
   24161              : static bool
   24162        23888 : expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
   24163              : {
   24164        23888 :   rtx rperm[2][32], vperm, l, h, ior, op, m128;
   24165        23888 :   unsigned int i, nelt, eltsz;
   24166              : 
   24167        23888 :   if (!TARGET_AVX2
   24168          367 :       || d->one_operand_p
   24169          202 :       || (d->vmode != V32QImode && d->vmode != V16HImode))
   24170              :     return false;
   24171              : 
   24172          112 :   for (i = 0; i < d->nelt; ++i)
   24173          112 :     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
   24174              :       return false;
   24175              : 
   24176            0 :   if (d->testing_p)
   24177              :     return true;
   24178              : 
   24179            0 :   nelt = d->nelt;
   24180            0 :   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
   24181              : 
   24182              :   /* Generate two permutation masks.  In the first permutation mask
   24183              :      the first quarter will contain indexes for the first half
   24184              :      of the op0, the second quarter will contain bit 7 set, third quarter
   24185              :      will contain indexes for the second half of the op0 and the
   24186              :      last quarter bit 7 set.  In the second permutation mask
   24187              :      the first quarter will contain bit 7 set, the second quarter
   24188              :      indexes for the first half of the op1, the third quarter bit 7 set
   24189              :      and last quarter indexes for the second half of the op1.
   24190              :      I.e. the first mask e.g. for V32QImode extract even will be:
   24191              :      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
   24192              :      (all values masked with 0xf except for -128) and second mask
   24193              :      for extract even will be
   24194              :      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
   24195            0 :   m128 = GEN_INT (-128);
   24196            0 :   for (i = 0; i < nelt; ++i)
   24197              :     {
   24198            0 :       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
   24199            0 :       unsigned which = d->perm[i] >= nelt;
   24200            0 :       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
   24201              : 
   24202            0 :       for (j = 0; j < eltsz; ++j)
   24203              :         {
   24204            0 :           rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
   24205            0 :           rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
   24206              :         }
   24207              :     }
   24208              : 
   24209            0 :   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
   24210            0 :   vperm = force_reg (V32QImode, vperm);
   24211              : 
   24212            0 :   l = gen_reg_rtx (V32QImode);
   24213            0 :   op = gen_lowpart (V32QImode, d->op0);
   24214            0 :   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
   24215              : 
   24216            0 :   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
   24217            0 :   vperm = force_reg (V32QImode, vperm);
   24218              : 
   24219            0 :   h = gen_reg_rtx (V32QImode);
   24220            0 :   op = gen_lowpart (V32QImode, d->op1);
   24221            0 :   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
   24222              : 
   24223            0 :   ior = gen_reg_rtx (V32QImode);
   24224            0 :   emit_insn (gen_iorv32qi3 (ior, l, h));
   24225              : 
   24226              :   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
   24227            0 :   op = gen_reg_rtx (V4DImode);
   24228            0 :   ior = gen_lowpart (V4DImode, ior);
   24229            0 :   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
   24230              :                                   const1_rtx, GEN_INT (3)));
   24231            0 :   emit_move_insn (d->target, gen_lowpart (d->vmode, op));
   24232              : 
   24233            0 :   return true;
   24234              : }
   24235              : 
   24236              : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement a
   24237              :    permutation (which is a blend) with and, andnot and or when pshufb is not available.
   24238              : 
   24239              :    It handles case:
   24240              :    __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
   24241              :    __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
   24242              : 
   24243              :    An element[i] must be chosen between op0[i] and op1[i] to satisfy the
   24244              :    requirement.
   24245              :  */
   24246              : 
   24247              : static bool
   24248        24977 : expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
   24249              : {
   24250        24977 :   rtx rperm[16], vperm;
   24251        24977 :   unsigned int i, nelt = d->nelt;
   24252              : 
   24253        24977 :   if (!TARGET_SSE2
   24254        24977 :       || d->one_operand_p
   24255        21148 :       || (d->vmode != V16QImode && d->vmode != V8HImode))
   24256              :     return false;
   24257              : 
   24258         7743 :   if (d->perm[0] != 0)
   24259              :     return false;
   24260              : 
   24261              :   /* The dest[i] must select an element between op0[i] and op1[i].  */
   24262        16310 :   for (i = 1; i < nelt; i++)
   24263        15240 :     if ((d->perm[i] % nelt) != i)
   24264              :       return false;
   24265              : 
   24266         1070 :   if (d->testing_p)
   24267              :      return true;
   24268              : 
   24269              :   /* Generates a blend mask for the operators AND and ANDNOT.  */
   24270          121 :   machine_mode inner_mode = GET_MODE_INNER (d->vmode);
   24271         1337 :   for (i = 0; i < nelt; i++)
   24272         1790 :     rperm[i] = (d->perm[i] <  nelt) ? CONSTM1_RTX (inner_mode)
   24273          574 :       : CONST0_RTX (inner_mode);
   24274              : 
   24275          121 :   vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
   24276          121 :   vperm = force_reg (d->vmode, vperm);
   24277              : 
   24278          121 :   ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
   24279              : 
   24280          121 :   return true;
   24281              : }
   24282              : 
   24283              : /* Implement permutation with pslldq + psrldq + por when pshufb is not
   24284              :    available.  */
   24285              : static bool
   24286        43553 : expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
   24287              : {
   24288        43553 :   unsigned i, nelt = d->nelt;
   24289        43553 :   unsigned start1, end1 = -1;
   24290        43553 :   machine_mode vmode = d->vmode, imode;
   24291        43553 :   int start2 = -1;
   24292        43553 :   bool clear_op0, clear_op1;
   24293        43553 :   unsigned inner_size;
   24294        43553 :   rtx op0, op1, dop1;
   24295        43553 :   rtx (*gen_vec_shr) (rtx, rtx, rtx);
   24296        43553 :   rtx (*gen_vec_shl) (rtx, rtx, rtx);
   24297              : 
   24298              :   /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
   24299        43553 :   if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
   24300              :     return false;
   24301              : 
   24302        13800 :   start1 = d->perm[0];
   24303        38746 :   for (i = 1; i < nelt; i++)
   24304              :     {
   24305        37934 :       if (d->perm[i] != d->perm[i-1] + 1
   24306        11392 :           || d->perm[i] == nelt)
   24307              :         {
   24308        26788 :           if (start2 == -1)
   24309              :             {
   24310        13800 :               start2 = d->perm[i];
   24311        13800 :               end1 = d->perm[i-1];
   24312              :             }
   24313              :           else
   24314              :             return false;
   24315              :         }
   24316              :     }
   24317              : 
   24318          812 :   clear_op0 = end1 != nelt - 1;
   24319          812 :   clear_op1 = start2 % nelt != 0;
   24320              :   /* pandn/pand is needed to clear upper/lower bits of op0/op1.  */
   24321          812 :   if (!pandn && (clear_op0 || clear_op1))
   24322              :     return false;
   24323              : 
   24324          523 :   if (d->testing_p)
   24325              :     return true;
   24326              : 
   24327           65 :   gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
   24328           24 :   gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
   24329           65 :   imode = GET_MODE_INNER (vmode);
   24330           65 :   inner_size = GET_MODE_BITSIZE (imode);
   24331           65 :   op0 = gen_reg_rtx (vmode);
   24332           65 :   op1 = gen_reg_rtx (vmode);
   24333              : 
   24334           65 :   if (start1)
   24335           61 :     emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
   24336              :   else
   24337            4 :     emit_move_insn (op0, d->op0);
   24338              : 
   24339           65 :   dop1 = d->op1;
   24340           65 :   if (d->one_operand_p)
   24341           44 :     dop1 = d->op0;
   24342              : 
   24343           65 :   int shl_offset = end1 - start1 + 1 - start2 % nelt;
   24344           65 :   if (shl_offset)
   24345           45 :     emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
   24346              :   else
   24347           20 :     emit_move_insn (op1, dop1);
   24348              : 
   24349              :   /* Clear lower/upper bits for op0/op1.  */
   24350           65 :   if (clear_op0 || clear_op1)
   24351              :     {
   24352              :       rtx vec[16];
   24353              :       rtx const_vec;
   24354              :       rtx clear;
   24355          664 :       for (i = 0; i != nelt; i++)
   24356              :         {
   24357          616 :           if (i < (end1 - start1 + 1))
   24358          251 :             vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
   24359              :           else
   24360          365 :             vec[i] = CONST0_RTX (imode);
   24361              :         }
   24362           48 :       const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
   24363           48 :       const_vec = validize_mem (force_const_mem (vmode, const_vec));
   24364           48 :       clear = force_reg (vmode, const_vec);
   24365              : 
   24366           48 :       if (clear_op0)
   24367           40 :         emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
   24368           48 :       if (clear_op1)
   24369           36 :         emit_move_insn (op1, gen_rtx_AND (vmode,
   24370              :                                           gen_rtx_NOT (vmode, clear),
   24371              :                                           op1));
   24372              :     }
   24373              : 
   24374           65 :   emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
   24375           65 :   return true;
   24376              : }
   24377              : 
   24378              : /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   24379              :    and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or V32QI
   24380              :    operands with two "and" and "pack" or two "shift" and "pack" insns.
   24381              :    We should have already failed all two instruction sequences.  */
   24382              : 
   24383              : static bool
   24384        46304 : expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   24385              : {
   24386        46304 :   rtx op, dop0, dop1, t;
   24387        46304 :   unsigned i, odd, c, s, nelt = d->nelt;
   24388        46304 :   int pblendw_i = 0;
   24389        46304 :   bool end_perm = false;
   24390        46304 :   machine_mode half_mode;
   24391        46304 :   rtx (*gen_and) (rtx, rtx, rtx);
   24392        46304 :   rtx (*gen_pack) (rtx, rtx, rtx);
   24393        46304 :   rtx (*gen_shift) (rtx, rtx, rtx);
   24394              : 
   24395        46304 :   if (d->one_operand_p)
   24396              :     return false;
   24397              : 
   24398        40994 :   switch (d->vmode)
   24399              :     {
   24400         4471 :     case E_V4HImode:
   24401              :       /* Required for "pack".  */
   24402         4471 :       if (!TARGET_SSE4_1)
   24403              :         return false;
   24404              :       c = 0xffff;
   24405              :       s = 16;
   24406              :       half_mode = V2SImode;
   24407              :       gen_and = gen_andv2si3;
   24408              :       gen_pack = gen_mmx_packusdw;
   24409              :       gen_shift = gen_lshrv2si3;
   24410              :       pblendw_i = 0x5;
   24411              :       break;
   24412         5931 :     case E_V8HImode:
   24413              :       /* Required for "pack".  */
   24414         5931 :       if (!TARGET_SSE4_1)
   24415              :         return false;
   24416              :       c = 0xffff;
   24417              :       s = 16;
   24418              :       half_mode = V4SImode;
   24419              :       gen_and = gen_andv4si3;
   24420              :       gen_pack = gen_sse4_1_packusdw;
   24421              :       gen_shift = gen_lshrv4si3;
   24422              :       pblendw_i = 0x55;
   24423              :       break;
   24424              :     case E_V8QImode:
   24425              :       /* No check as all instructions are SSE2.  */
   24426              :       c = 0xff;
   24427              :       s = 8;
   24428              :       half_mode = V4HImode;
   24429              :       gen_and = gen_andv4hi3;
   24430              :       gen_pack = gen_mmx_packuswb;
   24431              :       gen_shift = gen_lshrv4hi3;
   24432              :       break;
   24433        14217 :     case E_V16QImode:
   24434              :       /* No check as all instructions are SSE2.  */
   24435        14217 :       c = 0xff;
   24436        14217 :       s = 8;
   24437        14217 :       half_mode = V8HImode;
   24438        14217 :       gen_and = gen_andv8hi3;
   24439        14217 :       gen_pack = gen_sse2_packuswb;
   24440        14217 :       gen_shift = gen_lshrv8hi3;
   24441        14217 :       break;
   24442          440 :     case E_V16HImode:
   24443          440 :       if (!TARGET_AVX2)
   24444              :         return false;
   24445              :       c = 0xffff;
   24446              :       s = 16;
   24447              :       half_mode = V8SImode;
   24448              :       gen_and = gen_andv8si3;
   24449              :       gen_pack = gen_avx2_packusdw;
   24450              :       gen_shift = gen_lshrv8si3;
   24451              :       pblendw_i = 0x5555;
   24452              :       end_perm = true;
   24453              :       break;
   24454          276 :     case E_V32QImode:
   24455          276 :       if (!TARGET_AVX2)
   24456              :         return false;
   24457              :       c = 0xff;
   24458              :       s = 8;
   24459              :       half_mode = V16HImode;
   24460              :       gen_and = gen_andv16hi3;
   24461              :       gen_pack = gen_avx2_packuswb;
   24462              :       gen_shift = gen_lshrv16hi3;
   24463              :       end_perm = true;
   24464              :       break;
   24465              :     default:
   24466              :       /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
   24467              :          are more profitable than general shuffles.  */
   24468              :       return false;
   24469              :     }
   24470              : 
   24471              :   /* Check that permutation is even or odd.  */
   24472        20066 :   odd = d->perm[0];
   24473        20066 :   if (odd > 1)
   24474              :     return false;
   24475              : 
   24476       229473 :   for (i = 1; i < nelt; ++i)
   24477       213410 :     if (d->perm[i] != 2 * i + odd)
   24478              :       return false;
   24479              : 
   24480        16063 :   if (d->testing_p)
   24481              :     return true;
   24482              : 
   24483         5511 :   dop0 = gen_reg_rtx (half_mode);
   24484         5511 :   dop1 = gen_reg_rtx (half_mode);
   24485         5511 :   if (odd == 0)
   24486              :     {
   24487              :       /* Use pblendw since const_vector 0 should be cheaper than
   24488              :          const_vector 0xffff.  */
   24489         4789 :       if (d->vmode == V4HImode
   24490              :           || d->vmode == E_V8HImode
   24491              :           || d->vmode == E_V16HImode)
   24492              :         {
   24493          872 :           rtx dop0_t = gen_reg_rtx (d->vmode);
   24494          872 :           rtx dop1_t = gen_reg_rtx (d->vmode);
   24495          872 :           t = gen_reg_rtx (d->vmode);
   24496          872 :           emit_move_insn (t, CONST0_RTX (d->vmode));
   24497              : 
   24498          872 :           emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
   24499              :                                                      GEN_INT (pblendw_i)));
   24500          872 :           emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
   24501              :                                                      GEN_INT (pblendw_i)));
   24502              : 
   24503          872 :           emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
   24504          872 :           emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
   24505          872 :         }
   24506              :       else
   24507              :         {
   24508         3917 :           t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
   24509         3917 :           t = force_reg (half_mode, t);
   24510         3917 :           emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
   24511         3917 :           emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
   24512              :         }
   24513              :     }
   24514              :   else
   24515              :     {
   24516         1444 :       emit_insn (gen_shift (dop0,
   24517          722 :                             gen_lowpart (half_mode, d->op0),
   24518              :                             GEN_INT (s)));
   24519         1444 :       emit_insn (gen_shift (dop1,
   24520          722 :                             gen_lowpart (half_mode, d->op1),
   24521              :                             GEN_INT (s)));
   24522              :     }
   24523              :   /* In AVX2 for 256 bit case we need to permute pack result.  */
   24524         5511 :   if (TARGET_AVX2 && end_perm)
   24525              :     {
   24526          419 :       op = gen_reg_rtx (d->vmode);
   24527          419 :       t = gen_reg_rtx (V4DImode);
   24528          419 :       emit_insn (gen_pack (op, dop0, dop1));
   24529          838 :       emit_insn (gen_avx2_permv4di_1 (t,
   24530          419 :                                       gen_lowpart (V4DImode, op),
   24531              :                                       const0_rtx,
   24532              :                                       const2_rtx,
   24533              :                                       const1_rtx,
   24534              :                                       GEN_INT (3)));
   24535          419 :       emit_move_insn (d->target, gen_lowpart (d->vmode, t));
   24536              :     }
   24537              :   else
   24538         5092 :     emit_insn (gen_pack (d->target, dop0, dop1));
   24539              : 
   24540              :   return true;
   24541              : }
   24542              : 
   24543              : /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   24544              :    and extract-odd permutations of two V64QI operands
   24545              :    with two "shifts", two "truncs" and one "concat" insns for "odd"
   24546              :    and two "truncs" and one "concat" insn for "even".
   24547              :    We should have already failed all two instruction sequences.  */
   24548              : 
   24549              : static bool
   24550        23932 : expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
   24551              : {
   24552        23932 :   rtx t1, t2, t3, t4;
   24553        23932 :   unsigned i, odd, nelt = d->nelt;
   24554              : 
   24555        23932 :   if (!TARGET_AVX512BW
   24556           74 :       || d->one_operand_p
   24557           38 :       || d->vmode != V64QImode)
   24558              :     return false;
   24559              : 
   24560              :   /* Check that permutation is even or odd.  */
   24561           38 :   odd = d->perm[0];
   24562           38 :   if (odd > 1)
   24563              :     return false;
   24564              : 
   24565         1662 :   for (i = 1; i < nelt; ++i)
   24566         1637 :     if (d->perm[i] != 2 * i + odd)
   24567              :       return false;
   24568              : 
   24569           25 :   if (d->testing_p)
   24570              :     return true;
   24571              : 
   24572              : 
   24573           25 :   if (odd)
   24574              :     {
   24575            5 :       t1 = gen_reg_rtx (V32HImode);
   24576            5 :       t2 = gen_reg_rtx (V32HImode);
   24577           10 :       emit_insn (gen_lshrv32hi3 (t1,
   24578            5 :                                  gen_lowpart (V32HImode, d->op0),
   24579              :                                  GEN_INT (8)));
   24580           10 :       emit_insn (gen_lshrv32hi3 (t2,
   24581            5 :                                  gen_lowpart (V32HImode, d->op1),
   24582              :                                  GEN_INT (8)));
   24583              :     }
   24584              :   else
   24585              :     {
   24586           20 :       t1 = gen_lowpart (V32HImode, d->op0);
   24587           20 :       t2 = gen_lowpart (V32HImode, d->op1);
   24588              :     }
   24589              : 
   24590           25 :   t3 = gen_reg_rtx (V32QImode);
   24591           25 :   t4 = gen_reg_rtx (V32QImode);
   24592           25 :   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
   24593           25 :   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
   24594           25 :   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
   24595              : 
   24596           25 :   return true;
   24597              : }
   24598              : 
   24599              : /* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   24600              :    and extract-odd permutations.  */
   24601              : 
   24602              : static bool
   24603        12950 : expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
   24604              : {
   24605        12950 :   rtx t1, t2, t3, t4, t5;
   24606              : 
   24607        12950 :   switch (d->vmode)
   24608              :     {
   24609           19 :     case E_V4DFmode:
   24610           19 :       if (d->testing_p)
   24611              :         break;
   24612            1 :       t1 = gen_reg_rtx (V4DFmode);
   24613            1 :       t2 = gen_reg_rtx (V4DFmode);
   24614              : 
   24615              :       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
   24616            1 :       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
   24617            1 :       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
   24618              : 
   24619              :       /* Now an unpck[lh]pd will produce the result required.  */
   24620            1 :       if (odd)
   24621            0 :         t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
   24622              :       else
   24623            1 :         t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
   24624            1 :       emit_insn (t3);
   24625            1 :       break;
   24626              : 
   24627         1214 :     case E_V8SFmode:
   24628         1214 :       {
   24629         1214 :         int mask = odd ? 0xdd : 0x88;
   24630              : 
   24631         1214 :         if (d->testing_p)
   24632              :           break;
   24633          186 :         t1 = gen_reg_rtx (V8SFmode);
   24634          186 :         t2 = gen_reg_rtx (V8SFmode);
   24635          186 :         t3 = gen_reg_rtx (V8SFmode);
   24636              : 
   24637              :         /* Shuffle within the 128-bit lanes to produce:
   24638              :            { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
   24639          186 :         emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
   24640              :                                       GEN_INT (mask)));
   24641              : 
   24642              :         /* Shuffle the lanes around to produce:
   24643              :            { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
   24644          186 :         emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
   24645              :                                             GEN_INT (0x3)));
   24646              : 
   24647              :         /* Shuffle within the 128-bit lanes to produce:
   24648              :            { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
   24649          186 :         emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
   24650              : 
   24651              :         /* Shuffle within the 128-bit lanes to produce:
   24652              :            { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
   24653          186 :         emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
   24654              : 
   24655              :         /* Shuffle the lanes around to produce:
   24656              :            { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
   24657          186 :         emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
   24658              :                                             GEN_INT (0x20)));
   24659              :       }
   24660          186 :       break;
   24661              : 
   24662            0 :     case E_V2DFmode:
   24663            0 :     case E_V4SFmode:
   24664            0 :     case E_V2DImode:
   24665            0 :     case E_V2SImode:
   24666            0 :     case E_V4SImode:
   24667            0 :     case E_V2HImode:
   24668              :       /* These are always directly implementable by expand_vec_perm_1.  */
   24669            0 :       gcc_unreachable ();
   24670              : 
   24671            0 :     case E_V2SFmode:
   24672            0 :       gcc_assert (TARGET_MMX_WITH_SSE);
   24673              :       /* We have no suitable instructions.  */
   24674            0 :       if (d->testing_p)
   24675              :         return false;
   24676              :       break;
   24677              : 
   24678         1550 :     case E_V4QImode:
   24679         1550 :       if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
   24680            0 :         return expand_vec_perm_pshufb2 (d);
   24681              :       else
   24682              :         {
   24683         1550 :           if (d->testing_p)
   24684              :             break;
   24685              :           /* We need 2*log2(N)-1 operations to achieve odd/even
   24686              :              with interleave. */
   24687          178 :           t1 = gen_reg_rtx (V4QImode);
   24688          178 :           emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
   24689          178 :           emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
   24690          178 :           if (odd)
   24691           41 :             t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
   24692              :           else
   24693          137 :             t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
   24694          178 :           emit_insn (t2);
   24695              :         }
   24696          178 :       break;
   24697              : 
   24698         1527 :     case E_V4HImode:
   24699         1527 :       if (TARGET_SSE4_1)
   24700           92 :         return expand_vec_perm_even_odd_pack (d);
   24701         1435 :       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
   24702           20 :         return expand_vec_perm_pshufb2 (d);
   24703              :       else
   24704              :         {
   24705         1415 :           if (d->testing_p)
   24706              :             break;
   24707              :           /* We need 2*log2(N)-1 operations to achieve odd/even
   24708              :              with interleave. */
   24709          454 :           t1 = gen_reg_rtx (V4HImode);
   24710          454 :           emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
   24711          454 :           emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
   24712          454 :           if (odd)
   24713            8 :             t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
   24714              :           else
   24715          446 :             t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
   24716          454 :           emit_insn (t2);
   24717              :         }
   24718          454 :       break;
   24719              : 
   24720         6633 :     case E_V8HImode:
   24721         6633 :       if (TARGET_SSE4_1)
   24722          440 :         return expand_vec_perm_even_odd_pack (d);
   24723         6193 :       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
   24724            1 :         return expand_vec_perm_pshufb2 (d);
   24725              :       else
   24726              :         {
   24727         6192 :           if (d->testing_p)
   24728              :             break;
   24729              :           /* We need 2*log2(N)-1 operations to achieve odd/even
   24730              :              with interleave. */
   24731         2698 :           t1 = gen_reg_rtx (V8HImode);
   24732         2698 :           t2 = gen_reg_rtx (V8HImode);
   24733         2698 :           emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
   24734         2698 :           emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
   24735         2698 :           emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
   24736         2698 :           emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
   24737         2698 :           if (odd)
   24738           92 :             t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
   24739              :           else
   24740         2606 :             t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
   24741         2698 :           emit_insn (t3);
   24742              :         }
   24743         2698 :       break;
   24744              : 
   24745         1320 :     case E_V8QImode:
   24746         1320 :     case E_V16QImode:
   24747         1320 :       return expand_vec_perm_even_odd_pack (d);
   24748              : 
   24749          467 :     case E_V16HImode:
   24750          467 :     case E_V32QImode:
   24751          467 :       return expand_vec_perm_even_odd_pack (d);
   24752              : 
   24753           25 :     case E_V64QImode:
   24754           25 :       return expand_vec_perm_even_odd_trunc (d);
   24755              : 
   24756           19 :     case E_V4DImode:
   24757           19 :       if (!TARGET_AVX2)
   24758              :         {
   24759           19 :           struct expand_vec_perm_d d_copy = *d;
   24760           19 :           d_copy.vmode = V4DFmode;
   24761           19 :           if (d->testing_p)
   24762           18 :             d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
   24763              :           else
   24764            1 :             d_copy.target = gen_reg_rtx (V4DFmode);
   24765           19 :           d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
   24766           19 :           d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
   24767           19 :           if (expand_vec_perm_even_odd_1 (&d_copy, odd))
   24768              :             {
   24769           19 :               if (!d->testing_p)
   24770            1 :                 emit_move_insn (d->target,
   24771            1 :                                 gen_lowpart (V4DImode, d_copy.target));
   24772           19 :               return true;
   24773              :             }
   24774              :           return false;
   24775              :         }
   24776              : 
   24777            0 :       if (d->testing_p)
   24778              :         break;
   24779              : 
   24780            0 :       t1 = gen_reg_rtx (V4DImode);
   24781            0 :       t2 = gen_reg_rtx (V4DImode);
   24782              : 
   24783              :       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
   24784            0 :       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
   24785            0 :       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
   24786              : 
    24787              :       /* Now a vpunpck[lh]qdq will produce the result required.  */
   24788            0 :       if (odd)
   24789            0 :         t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
   24790              :       else
   24791            0 :         t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
   24792            0 :       emit_insn (t3);
   24793            0 :       break;
   24794              : 
   24795          176 :     case E_V8SImode:
   24796          176 :       if (!TARGET_AVX2)
   24797              :         {
   24798           38 :           struct expand_vec_perm_d d_copy = *d;
   24799           38 :           d_copy.vmode = V8SFmode;
   24800           38 :           if (d->testing_p)
   24801           38 :             d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
   24802              :           else
   24803            0 :             d_copy.target = gen_reg_rtx (V8SFmode);
   24804           38 :           d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
   24805           38 :           d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
   24806           38 :           if (expand_vec_perm_even_odd_1 (&d_copy, odd))
   24807              :             {
   24808           38 :               if (!d->testing_p)
   24809            0 :                 emit_move_insn (d->target,
   24810            0 :                                 gen_lowpart (V8SImode, d_copy.target));
   24811           38 :               return true;
   24812              :             }
   24813              :           return false;
   24814              :         }
   24815              : 
   24816          138 :       if (d->testing_p)
   24817              :         break;
   24818              : 
   24819          138 :       t1 = gen_reg_rtx (V8SImode);
   24820          138 :       t2 = gen_reg_rtx (V8SImode);
   24821          138 :       t3 = gen_reg_rtx (V4DImode);
   24822          138 :       t4 = gen_reg_rtx (V4DImode);
   24823          138 :       t5 = gen_reg_rtx (V4DImode);
   24824              : 
   24825              :       /* Shuffle the lanes around into
   24826              :          { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
   24827          276 :       emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
   24828          138 :                                     gen_lowpart (V4DImode, d->op1),
   24829              :                                     GEN_INT (0x20)));
   24830          276 :       emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
   24831          138 :                                     gen_lowpart (V4DImode, d->op1),
   24832              :                                     GEN_INT (0x31)));
   24833              : 
   24834              :       /* Swap the 2nd and 3rd position in each lane into
   24835              :          { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
   24836          138 :       emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
   24837              :                                     GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
   24838          138 :       emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
   24839              :                                     GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
   24840              : 
    24841              :       /* Now a vpunpck[lh]qdq will produce
   24842              :          { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
   24843          138 :       if (odd)
   24844            0 :         t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
   24845            0 :                                            gen_lowpart (V4DImode, t2));
   24846              :       else
   24847          138 :         t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
   24848          138 :                                           gen_lowpart (V4DImode, t2));
   24849          138 :       emit_insn (t3);
   24850          138 :       emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
   24851          138 :       break;
   24852              : 
   24853            0 :     default:
   24854            0 :       gcc_unreachable ();
   24855              :     }
   24856              : 
   24857              :   return true;
   24858              : }
   24859              : 
    24860              : /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
    24861              :    extract-even and extract-odd permutations.  */
    24862              : /* Matches when D->perm[i] == 2 * i + odd for every i, with odd being 0 or 1; returns false otherwise.  */
    24863              : static bool
    24864        23839 : expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
    24865              : {
    24866        23839 :   unsigned i, odd, nelt = d->nelt;
    24867              :
    24868        23839 :   odd = d->perm[0];  /* The first selected index decides even (0) versus odd (1).  */
    24869        23839 :   if (odd != 0 && odd != 1)
    24870              :     return false;
    24871              :   /* Every later index must continue the stride-2 sequence started by perm[0].  */
    24872        65500 :   for (i = 1; i < nelt; ++i)
    24873        57550 :     if (d->perm[i] != 2 * i + odd)
    24874              :       return false;
    24875              :   /* NOTE(review): V32HImode is refused when only testing and AVX512BW is off; the reason is not visible in this function.  */
    24876         7950 :   if (d->vmode == E_V32HImode
    24877           12 :       && d->testing_p
    24878           12 :       && !TARGET_AVX512BW)
    24879              :     return false;
    24880              :
    24881         7938 :   return expand_vec_perm_even_odd_1 (d, odd);  /* The mode-specific expansion lives in the _1 helper.  */
    24882              : }
   24883              : 
    24884              : /* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
    24885              :    permutations.  We assume that expand_vec_perm_1 has already failed.  */
    24886              : /* Broadcasts element D->perm[0] of D->op0.  Returns true without emitting when D->testing_p in the implementable cases; returns false for V32QI/V16HI/V8SI/V4DI/V64QI/V32HI.  */
    24887              : static bool
    24888         1033 : expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
    24889              : {
    24890         1033 :   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;  /* ELT: index to broadcast; NELT2: half the element count.  */
    24891         1033 :   machine_mode vmode = d->vmode;
    24892         1033 :   rtx (*gen) (rtx, rtx, rtx);
    24893         1033 :   unsigned char perm2[4];
    24894         1033 :   rtx op0 = d->op0, dest;
    24895         1033 :   bool ok;
    24896              :
    24897         1033 :   switch (vmode)
    24898              :     {
    24899            0 :     case E_V4DFmode:
    24900            0 :     case E_V8SFmode:
    24901              :       /* These are special-cased in sse.md so that we can optionally
    24902              :          use the vbroadcast instruction.  They expand to two insns
    24903              :          if the input happens to be in a register.  */
    24904            0 :       gcc_unreachable ();
    24905              :
    24906            0 :     case E_V2DFmode:
    24907            0 :     case E_V2SFmode:
    24908            0 :     case E_V4SFmode:
    24909            0 :     case E_V2DImode:
    24910            0 :     case E_V2SImode:
    24911            0 :     case E_V4SImode:
    24912            0 :     case E_V2HImode:
    24913            0 :     case E_V4HImode:
    24914              :       /* These are always implementable using standard shuffle patterns.  */
    24915            0 :       gcc_unreachable ();
    24916              :
    24917           16 :     case E_V4QImode:
    24918              :       /* This can be implemented via interleave and pshuflw.  */
    24919           16 :       if (d->testing_p)
    24920              :         return true;
    24921              :
    24922            8 :       if (elt >= nelt2)
    24923              :         {
    24924            4 :           gen = gen_mmx_punpckhbw_low;
    24925            4 :           elt -= nelt2;  /* Re-base ELT within the high half chosen above.  */
    24926              :         }
    24927              :       else
    24928              :         gen = gen_mmx_punpcklbw_low;
    24929              :
    24930            8 :       dest = gen_reg_rtx (vmode);
    24931            8 :       emit_insn (gen (dest, op0, op0));  /* Interleave OP0 with itself.  */
    24932            8 :       vmode = get_mode_wider_vector (vmode);  /* NOTE(review): presumably same size with elements twice as wide -- confirm.  */
    24933            8 :       op0 = gen_lowpart (vmode, dest);
    24934              :
    24935            8 :       memset (perm2, elt, 2);  /* Selector { elt, elt }.  */
    24936            8 :       dest = gen_reg_rtx (vmode);
    24937            8 :       ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
    24938            8 :       gcc_assert (ok);
    24939              :
    24940            8 :       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
    24941            8 :       return true;
    24942              :
    24943            4 :     case E_V8QImode:
    24944              :       /* This can be implemented via interleave.  We save one insn by
    24945              :          stopping once we have promoted to V2SImode and then use pshufd.  */
    24946            4 :       if (d->testing_p)
    24947              :         return true;
    24948            4 :       do
    24949              :         {
    24950            4 :           if (elt >= nelt2)
    24951              :             {
    24952            1 :               gen = vmode == V8QImode ? gen_mmx_punpckhbw
    24953              :                                       : gen_mmx_punpckhwd;
    24954            1 :               elt -= nelt2;  /* Re-base ELT within the high half chosen above.  */
    24955              :             }
    24956              :           else
    24957            3 :             gen = vmode == V8QImode ? gen_mmx_punpcklbw
    24958              :                                     : gen_mmx_punpcklwd;
    24959            4 :           nelt2 /= 2;  /* Half size for the next round in the wider mode.  */
    24960              :
    24961            4 :           dest = gen_reg_rtx (vmode);
    24962            4 :           emit_insn (gen (dest, op0, op0));  /* Interleave OP0 with itself.  */
    24963            4 :           vmode = get_mode_wider_vector (vmode);
    24964            4 :           op0 = gen_lowpart (vmode, dest);
    24965              :         }
    24966            4 :       while (vmode != V2SImode);
    24967              :
    24968            2 :       memset (perm2, elt, 2);  /* Selector { elt, elt }.  */
    24969            2 :       dest = gen_reg_rtx (vmode);
    24970            2 :       ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
    24971            2 :       gcc_assert (ok);
    24972              :
    24973            2 :       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
    24974            2 :       return true;
    24975              :
    24976         1004 :     case E_V8HImode:
    24977         1004 :     case E_V16QImode:
    24978              :       /* These can be implemented via interleave.  We save one insn by
    24979              :          stopping once we have promoted to V4SImode and then use pshufd.  */
    24980         1004 :       if (d->testing_p)
    24981              :         return true;
    24982         1540 :       do
    24983              :         {
    24984         1540 :           if (elt >= nelt2)
    24985              :             {
    24986           16 :               gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
    24987              :                                        : gen_vec_interleave_highv8hi;
    24988           16 :               elt -= nelt2;  /* Re-base ELT within the high half chosen above.  */
    24989              :             }
    24990              :           else
    24991         1524 :             gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
    24992              :                                      : gen_vec_interleave_lowv8hi;
    24993         1540 :           nelt2 /= 2;  /* Half size for the next round in the wider mode.  */
    24994              :
    24995         1540 :           dest = gen_reg_rtx (vmode);
    24996         1540 :           emit_insn (gen (dest, op0, op0));  /* Interleave OP0 with itself.  */
    24997         1540 :           vmode = get_mode_wider_vector (vmode);
    24998         1540 :           op0 = gen_lowpart (vmode, dest);
    24999              :         }
    25000         1540 :       while (vmode != V4SImode);
    25001              :
    25002          940 :       memset (perm2, elt, 4);  /* Selector { elt, elt, elt, elt }.  */
    25003          940 :       dest = gen_reg_rtx (vmode);
    25004          940 :       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
    25005          940 :       gcc_assert (ok);
    25006              :
    25007          940 :       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
    25008          940 :       return true;
    25009              :
    25010            1 :     case E_V8HFmode:
    25011            1 :     case E_V8BFmode:
    25012              :       /* This can be implemented via interleave and pshufd.  */
    25013            1 :       if (d->testing_p)
    25014              :         return true;
    25015              :
    25016            1 :       rtx (*gen_interleave) (machine_mode, rtx, rtx, rtx);
    25017            1 :       if (elt >= nelt2)
    25018              :         {
    25019            0 :           gen_interleave = gen_vec_interleave_high;
    25020            0 :           elt -= nelt2;  /* Re-base ELT within the high half chosen above.  */
    25021              :         }
    25022              :       else
    25023              :         gen_interleave = gen_vec_interleave_low;
    25024            1 :       nelt2 /= 2;
    25025              :
    25026            1 :       dest = gen_reg_rtx (vmode);
    25027            1 :       emit_insn (gen_interleave (vmode, dest, op0, op0));  /* Interleave OP0 with itself.  */
    25028              :
    25029            1 :       vmode = V4SImode;  /* One interleave suffices; finish with a V4SImode select.  */
    25030            1 :       op0 = gen_lowpart (vmode, dest);
    25031              :
    25032            1 :       memset (perm2, elt, 4);  /* Selector { elt, elt, elt, elt }.  */
    25033            1 :       dest = gen_reg_rtx (vmode);
    25034            1 :       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
    25035            1 :       gcc_assert (ok);
    25036              :
    25037            1 :       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
    25038            1 :       return true;
    25039              :
    25040            0 :     case E_V32QImode:
    25041            0 :     case E_V16HImode:
    25042            0 :     case E_V8SImode:
    25043            0 :     case E_V4DImode:
    25044              :       /* For AVX2 broadcasts of the first element vpbroadcast* or
    25045              :          vpermq should be used by expand_vec_perm_1.  */
    25046            0 :       gcc_assert (!TARGET_AVX2 || d->perm[0]);
    25047              :       return false;
    25048              :
    25049            6 :     case E_V64QImode:
    25050            6 :       gcc_assert (!TARGET_AVX512BW || d->perm[0]);  /* Same expectation as above, keyed on AVX512BW.  */
    25051              :       return false;
    25052              :
    25053            2 :     case E_V32HImode:
    25054            2 :       gcc_assert (!TARGET_AVX512BW);  /* Reaching here with AVX512BW enabled is treated as a bug.  */
    25055              :       return false;
    25056              :
    25057            0 :     default:
    25058            0 :       gcc_unreachable ();
    25059              :     }
    25060              : }
   25061              : 
    25062              : /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
    25063              :    broadcast permutations.  */
    25064              : /* A broadcast is a one-operand permutation whose indices all equal D->perm[0]; anything else returns false.  */
    25065              : static bool
    25066        89495 : expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
    25067              : {
    25068        89495 :   unsigned i, elt, nelt = d->nelt;
    25069              :
    25070        89495 :   if (!d->one_operand_p)
    25071              :     return false;
    25072              :
    25073         5414 :   elt = d->perm[0];  /* Candidate element to be replicated.  */
    25074         8285 :   for (i = 1; i < nelt; ++i)
    25075         8177 :     if (d->perm[i] != elt)
    25076              :       return false;
    25077              :
    25078          108 :   return expand_vec_perm_broadcast_1 (d);  /* Pattern matched; the _1 helper does the expansion.  */
    25079              : }
   25080              : 
    25081              : /* Implement arbitrary permutations of two V64QImode operands
    25082              :    with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  Requires
                         TARGET_AVX512BW; returns false for any other mode or target.  */
    25083              : static bool
    25084        23888 : expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
    25085              : {
    25086        23888 :   if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    25087              :     return false;
    25088              :
    25089           49 :   if (d->testing_p)
    25090              :     return true;
    25091              :
    25092           49 :   struct expand_vec_perm_d ds[2];
    25093           49 :   rtx rperm[128], vperm, target0, target1;  /* RPERM holds two 64-entry byte selectors back to back.  */
    25094           49 :   unsigned int i, nelt;
    25095           49 :   machine_mode vmode;
    25096              :
    25097           49 :   nelt = d->nelt;
    25098           49 :   vmode = V64QImode;
    25099              :   /* Both sub-permutations work on the operands viewed as V32HImode.  */
    25100          147 :   for (i = 0; i < 2; i++)
    25101              :     {
    25102           98 :       ds[i] = *d;
    25103           98 :       ds[i].vmode = V32HImode;
    25104           98 :       ds[i].nelt = 32;
    25105           98 :       ds[i].target = gen_reg_rtx (V32HImode);
    25106           98 :       ds[i].op0 = gen_lowpart (V32HImode, d->op0);
    25107           98 :       ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    25108              :     }
    25109              :
    25110              :   /* Prepare permutations such that the first one takes care of
    25111              :      putting the even bytes into the right positions or one higher
    25112              :      positions (ds[0]) and the second one takes care of
    25113              :      putting the odd bytes into the right positions or one below
    25114              :      (ds[1]).  */
    25115              :
    25116         3185 :   for (i = 0; i < nelt; i++)
    25117              :     {
    25118         3136 :       ds[i & 1].perm[i / 2] = d->perm[i] / 2;  /* Word that contains the wanted byte.  */
    25119         3136 :       if (i & 1)
    25120              :         {
    25121         1568 :           rperm[i] = constm1_rtx;  /* Odd result bytes are left to the second selector.  */
    25122         1568 :           rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
    25123              :         }
    25124              :       else
    25125              :         {
    25126         1568 :           rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));  /* Low bit of the source index picks the low or high byte of the word.  */
    25127         1568 :           rperm[i + 64] = constm1_rtx;
    25128              :         }
    25129              :     }
    25130              :   /* Word-granular permutes first; the byte shuffles below then pick one byte out of each word.  */
    25131           49 :   bool ok = expand_vec_perm_1 (&ds[0]);
    25132           49 :   gcc_assert (ok);
    25133           49 :   ds[0].target = gen_lowpart (V64QImode, ds[0].target);
    25134              :
    25135           49 :   ok = expand_vec_perm_1 (&ds[1]);
    25136           49 :   gcc_assert (ok);
    25137           49 :   ds[1].target = gen_lowpart (V64QImode, ds[1].target);
    25138              :
    25139           49 :   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
    25140           49 :   vperm = force_reg (vmode, vperm);
    25141           49 :   target0 = gen_reg_rtx (V64QImode);
    25142           49 :   emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
    25143              :
    25144           49 :   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
    25145           49 :   vperm = force_reg (vmode, vperm);
    25146           49 :   target1 = gen_reg_rtx (V64QImode);
    25147           49 :   emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
    25148              :
    25149           49 :   emit_insn (gen_iorv64qi3 (d->target, target0, target1));  /* Each byte position has a real index in exactly one of the two selectors.  */
    25150           49 :   return true;
    25151              : }
   25152              : 
    25153              : /* Implement arbitrary permutation of two V32QImode and V16HImode operands
    25154              :    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
    25155              :    all the shorter instruction sequences.  */
    25156              : /* Requires TARGET_AVX2 and a two-operand permutation; returns false otherwise, and true without emitting when D->testing_p.  */
    25157              : static bool
    25158        15693 : expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
    25159              : {
    25160        15693 :   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
    25161        15693 :   unsigned int i, nelt, eltsz;
    25162        15693 :   bool used[4];
    25163              :
    25164        15693 :   if (!TARGET_AVX2
    25165          318 :       || d->one_operand_p
    25166          189 :       || (d->vmode != V32QImode && d->vmode != V16HImode))
    25167              :     return false;
    25168              :
    25169           54 :   if (d->testing_p)
    25170              :     return true;
    25171              :
    25172           54 :   nelt = d->nelt;
    25173           54 :   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
    25174              :
    25175              :   /* Generate 4 permutation masks.  If the required element is within
    25176              :      the same lane, it is shuffled in.  If the required element is from the
    25177              :      other lane, force a zero by setting bit 7 in the permutation mask.
    25178              :      In the other mask the mask has non-negative elements if element
    25179              :      is requested from the other lane, but also moved to the other lane,
    25180              :      so that the result of vpshufb can have the two V2TImode halves
    25181              :      swapped.  */
    25182           54 :   m128 = GEN_INT (-128);  /* Selector byte with only bit 7 set.  */
    25183         1836 :   for (i = 0; i < 32; ++i)
    25184              :     {
    25185         1728 :       rperm[0][i] = m128;
    25186         1728 :       rperm[1][i] = m128;
    25187         1728 :       rperm[2][i] = m128;
    25188         1728 :       rperm[3][i] = m128;
    25189              :     }
    25190           54 :   used[0] = false;
    25191           54 :   used[1] = false;
    25192           54 :   used[2] = false;
    25193           54 :   used[3] = false;
    25194         1590 :   for (i = 0; i < nelt; ++i)
    25195              :     {
    25196         1536 :       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
    25197         1536 :       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
    25198         2074 :       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
    25199              :       /* E: index within its half.  XLANE: nonzero when source and destination halves differ.  WHICH: bit 1 for op1, bit 0 for cross-lane.  */
    25200         3264 :       for (j = 0; j < eltsz; ++j)
    25201         1728 :         rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
    25202         1536 :       used[which] = true;
    25203              :     }
    25204              :
    25205          162 :   for (i = 0; i < 2; ++i)
    25206              :     {
    25207          108 :       if (!used[2 * i + 1])
    25208              :         {
    25209           22 :           h[i] = NULL_RTX;  /* No cross-lane element comes from this operand.  */
    25210           22 :           continue;
    25211              :         }
    25212           86 :       vperm = gen_rtx_CONST_VECTOR (V32QImode,
    25213           86 :                                     gen_rtvec_v (32, rperm[2 * i + 1]));
    25214           86 :       vperm = force_reg (V32QImode, vperm);
    25215           86 :       h[i] = gen_reg_rtx (V32QImode);
    25216           86 :       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
    25217           86 :       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    25218              :     }
    25219              :
    25220              :   /* Swap the 128-bit lanes of h[X].  */
    25221          162 :   for (i = 0; i < 2; ++i)
    25222              :    {
    25223          108 :      if (h[i] == NULL_RTX)
    25224           22 :        continue;
    25225           86 :      op = gen_reg_rtx (V4DImode);
    25226           86 :      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
    25227              :                                      const2_rtx, GEN_INT (3), const0_rtx,
    25228              :                                      const1_rtx));
    25229           86 :      h[i] = gen_lowpart (V32QImode, op);
    25230              :    }
    25231              :
    25232          162 :   for (i = 0; i < 2; ++i)
    25233              :     {
    25234          108 :       if (!used[2 * i])
    25235              :         {
    25236            0 :           l[i] = NULL_RTX;  /* No in-lane element comes from this operand.  */
    25237            0 :           continue;
    25238              :         }
    25239          108 :       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
    25240          108 :       vperm = force_reg (V32QImode, vperm);
    25241          108 :       l[i] = gen_reg_rtx (V32QImode);
    25242          108 :       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
    25243          108 :       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    25244              :     }
    25245              :   /* Combine the in-lane and lane-swapped shuffles of each operand into l[i].  */
    25246          162 :   for (i = 0; i < 2; ++i)
    25247              :     {
    25248          108 :       if (h[i] && l[i])
    25249              :         {
    25250           86 :           op = gen_reg_rtx (V32QImode);
    25251           86 :           emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
    25252           86 :           l[i] = op;
    25253              :         }
    25254           22 :       else if (h[i])
    25255            0 :         l[i] = h[i];
    25256              :     }
    25257              :   /* OR the two per-operand results; go through a V32QImode temporary when D->vmode is not V32QImode.  */
    25258           54 :   gcc_assert (l[0] && l[1]);
    25259           54 :   op = d->target;
    25260           54 :   if (d->vmode != V32QImode)
    25261           12 :     op = gen_reg_rtx (V32QImode);
    25262           54 :   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
    25263           54 :   if (op != d->target)
    25264           12 :     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
    25265              :   return true;
    25266              : }
   25267              : 
    25268              : /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
    25269              :    taken care of, perform the expansion in D and return true on success.  */
    25270              : /* Strategies are tried one after another, roughly by increasing instruction count as the section comments below indicate; the first that succeeds wins.  */
    25271              : static bool
    25272       332160 : ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
    25273              : {
    25274              :   /* Try a single instruction expansion.  */
    25275       332160 :   if (expand_vec_perm_1 (d))
    25276              :     return true;
    25277              :
    25278              :   /* Try sequences of two instructions.  */
    25279              :
    25280       101362 :   if (expand_vec_perm_pshuflw_pshufhw (d))
    25281              :     return true;
    25282              :
    25283        98899 :   if (expand_vec_perm_palignr (d, false))
    25284              :     return true;
    25285              :
    25286        95764 :   if (expand_vec_perm_interleave2 (d))
    25287              :     return true;
    25288              :
    25289        89495 :   if (expand_vec_perm_broadcast (d))
    25290              :     return true;
    25291              :
    25292        89395 :   if (expand_vec_perm_vpermq_perm_1 (d))
    25293              :     return true;
    25294              :
    25295        89395 :   if (expand_vec_perm_vperm2f128 (d))
    25296              :     return true;
    25297              :
    25298        89327 :   if (expand_vec_perm_pblendv (d))
    25299              :     return true;
    25300              :
    25301        87664 :   if (expand_vec_perm_2perm_interleave (d, true))
    25302              :     return true;
    25303              :
    25304        87302 :   if (expand_vec_perm_2perm_pblendv (d, true))
    25305              :     return true;
    25306              :
    25307        84421 :   if (expand_vec_perm_shufps_shufps (d))
    25308              :     return true;
    25309              :
    25310        49667 :   if (expand_vec_perm_punpckldq_pshuf (d))
    25311              :     return true;
    25312              :
    25313              :   /* Try sequences of three instructions.  */
    25314              :
    25315        43985 :   if (expand_vec_perm_even_odd_pack (d))
    25316              :     return true;
    25317              :
    25318        30181 :   if (expand_vec_perm_2vperm2f128_vshuf (d))
    25319              :     return true;
    25320              :
    25321        28968 :   if (expand_vec_perm_pshufb2 (d))
    25322              :     return true;
    25323              :
    25324        27580 :   if (expand_vec_perm_pslldq_psrldq_por (d, false))
    25325              :     return true;
    25326              :
    25327        27337 :   if (expand_vec_perm_interleave3 (d))
    25328              :     return true;
    25329              :
    25330        27199 :   if (expand_vec_perm_vperm2f128_vblend (d))
    25331              :     return true;
    25332              :
    25333        27199 :   if (expand_vec_perm_2perm_interleave (d, false))
    25334              :     return true;
    25335              :
    25336        26959 :   if (expand_vec_perm_2perm_pblendv (d, false))
    25337              :     return true;
    25338              :
    25339        26415 :   if (expand_vec_perm_psrlw_psllw_por (d))
    25340              :     return true;
    25341              :
    25342        24977 :   if (expand_vec_perm_pand_pandn_por (d))
    25343              :     return true;
    25344              :
    25345              :   /* Try sequences of four instructions.  */
    25346              :
    25347        23907 :   if (expand_vec_perm_even_odd_trunc (d))
    25348              :     return true;
    25349        23895 :   if (expand_vec_perm_vpshufb2_vpermq (d))
    25350              :     return true;
    25351              :
    25352        23888 :   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    25353              :     return true;
    25354              :
    25355        23888 :   if (expand_vec_perm_vpermt2_vpshub2 (d))
    25356              :     return true;
    25357              :
    25358              :   /* ??? Look for narrow permutations whose element orderings would
    25359              :      allow the promotion to a wider mode.  */
    25360              :
    25361              :   /* ??? Look for sequences of interleave or a wider permute that place
    25362              :      the data into the correct lanes for a half-vector shuffle like
    25363              :      pshuf[lh]w or vpermilps.  */
    25364              :
    25365              :   /* ??? Look for sequences of interleave that produce the desired results.
    25366              :      The combinatorics of punpck[lh] get pretty ugly... */
    25367              :
    25368        23839 :   if (expand_vec_perm_even_odd (d))
    25369              :     return true;
    25370              :
    25371              :   /* Generate four or five instructions.  */
    25372        15973 :   if (expand_vec_perm_pslldq_psrldq_por (d, true))
    25373              :     return true;
    25374              :
    25375              :   /* Even longer sequences.  */
    25376        15693 :   if (expand_vec_perm_vpshufb4_vpermq2 (d))
    25377              :     return true;
    25378              :
    25379              :   /* See if we can get the same permutation in different vector integer
    25380              :      mode.  */
    25381        15639 :   struct expand_vec_perm_d nd;
    25382        15639 :   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    25383              :     {
    25384            0 :       if (!d->testing_p)  /* Only copy the result back when actually emitting code.  */
    25385            0 :         emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
    25386            0 :       return true;
    25387              :     }
    25388              :
    25389              :   /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
    25390        15639 :   if (expand_vec_perm2_vperm2f128_vblend (d))
    25391              :     return true;
    25392              :
    25393              :   return false;  /* No strategy matched.  */
    25394              : }
   25395              : 
   25396              : /* If a permutation only uses one operand, make it clear. Returns true
   25397              :    if the permutation references both operands.  */
   25398              : 
   25399              : static bool
   25400        74080 : canonicalize_perm (struct expand_vec_perm_d *d)
   25401              : {
   25402        74080 :   int i, which, nelt = d->nelt;
   25403              : 
   25404       445068 :   for (i = which = 0; i < nelt; ++i)
   25405       504329 :     which |= (d->perm[i] < nelt ? 1 : 2);
   25406              : 
   25407        74080 :   d->one_operand_p = true;
   25408        74080 :   switch (which)
   25409              :     {
   25410            0 :     default:
   25411            0 :       gcc_unreachable();
   25412              : 
   25413        55069 :     case 3:
   25414        55069 :       if (!rtx_equal_p (d->op0, d->op1))
   25415              :         {
   25416        55018 :           d->one_operand_p = false;
   25417        55018 :           break;
   25418              :         }
   25419              :       /* The elements of PERM do not suggest that only the first operand
   25420              :          is used, but both operands are identical.  Allow easier matching
   25421              :          of the permutation by folding the permutation into the single
   25422              :          input vector.  */
   25423              :       /* FALLTHRU */
   25424              : 
   25425              :     case 2:
   25426         2913 :       for (i = 0; i < nelt; ++i)
   25427         2576 :         d->perm[i] &= nelt - 1;
   25428          337 :       d->op0 = d->op1;
   25429          337 :       break;
   25430              : 
   25431        18725 :     case 1:
   25432        18725 :       d->op1 = d->op0;
   25433        18725 :       break;
   25434              :     }
   25435              : 
   25436        74080 :   return (which == 3);
   25437              : }
   25438              : 
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.

   VMODE is the mode of the result and OP_MODE the mode of the inputs;
   permutations for which the two differ are rejected.  TARGET, OP0 and
   OP1 are the destination and the two input vectors, and SEL is the
   constant selector, which must contain exactly as many indices as VMODE
   has elements, each below twice that number.

   When TARGET is null the function only tests whether the permutation is
   supported and emits nothing that survives the call; otherwise it tries
   to emit the permutation.  Return true if the permutation is (or can be)
   expanded, false otherwise.  */

bool
ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
                               rtx target, rtx op0, rtx op1,
                               const vec_perm_indices &sel)
{
  if (vmode != op_mode)
    return false;

  struct expand_vec_perm_d d;
  /* Unmodified copy of the selector, kept for the retry at the end of the
     function after canonicalize_perm may have rewritten d.perm.  */
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF and BF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
                               GET_MODE_NUNITS (vmode)).require ();
      /* The operands are null when we are only testing.  */
      if (target)
        target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
        op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
        op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  /* A null TARGET means the caller only asks whether we can do it.  */
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
        return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
        return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
        return false;
      if (d.testing_p && TARGET_AVX512BW)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
        return false;
      if (d.testing_p && TARGET_AVX512BW)
        /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
        return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
        return false;
      if (d.testing_p && TARGET_AVX512VL)
        /* All implementable with a single vperm[it]2 insn.  */
        return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
        return false;
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
        return false;
      if (d.testing_p && TARGET_AVX2)
        /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
        return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
        return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
        return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
        return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
        return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
        return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
        return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
        return true;
      break;
    case E_V4QImode:
      if (!TARGET_SSE2)
        return false;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
        return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
        return true;
      break;
    default:
      return false;
    }

  /* Copy the selector into both D.PERM and the pristine PERM backup, and
     record in WHICH which inputs are referenced: bit 0 for indices that
     select from the first vector, bit 1 for indices that select from the
     second.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
        for (i = 0; i < nelt; ++i)
          d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
          && (d.vmode == V4SFmode || d.vmode == V2SFmode
              || d.vmode == V4SImode || d.vmode == V2SImode
              || d.vmode == V4HImode || d.vmode == V2HImode))
        return true;

      /* Otherwise we have to go through the motions and see if we can
         figure out how to generate the requested permutation.  */
      /* No real operands exist when testing, so stand-in registers
         numbered just past LAST_VIRTUAL_REGISTER are used instead; the
         second input gets a distinct register only for a two-operand
         permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
        d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      /* Run the trial expansion inside its own insn sequence, which is
         closed again without being emitted anywhere.  */
      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  /* From here on we are really expanding.  TWO_ARGS remembers whether the
     selector referenced both inputs, even if canonicalize_perm folded the
     permutation into a one-operand one because the inputs are equal.  */
  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      /* Work on a copy so that a failed attempt leaves D.PERM intact.  */
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
        {
          d.op1 = dzero.op1 = force_reg (vmode, d.op1);
          /* Put the zero vector second in the copy; toggling the NELT bit
             of each index retargets it to the swapped operand.  */
          std::swap (dzero.op0, dzero.op1);
          for (i = 0; i < nelt; ++i)
            dzero.perm[i] ^= nelt;
        }
      else
        d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
                                  dzero.perm, nelt, dzero.testing_p))
        return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  /* Keep the two operands pointer-identical if they were so before.  */
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      /* Restore the original, unflattened selector.  */
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
   25662              : 
   25663              : void
   25664         8148 : ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   25665              : {
   25666         8148 :   struct expand_vec_perm_d d;
   25667         8148 :   unsigned i, nelt;
   25668              : 
   25669         8148 :   d.target = targ;
   25670         8148 :   d.op0 = op0;
   25671         8148 :   d.op1 = op1;
   25672         8148 :   d.vmode = GET_MODE (targ);
   25673         8148 :   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   25674         8148 :   d.one_operand_p = false;
   25675         8148 :   d.testing_p = false;
   25676              : 
   25677        77136 :   for (i = 0; i < nelt; ++i)
   25678        68988 :     d.perm[i] = i * 2 + odd;
   25679              : 
   25680              :   /* We'll either be able to implement the permutation directly...  */
   25681         8148 :   if (expand_vec_perm_1 (&d))
   25682         3193 :     return;
   25683              : 
   25684              :   /* ... or we use the special-case patterns.  */
   25685         4955 :   expand_vec_perm_even_odd_1 (&d, odd);
   25686              : }
   25687              : 
   25688              : static void
   25689          922 : ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   25690              : {
   25691          922 :   struct expand_vec_perm_d d;
   25692          922 :   unsigned i, nelt, base;
   25693          922 :   bool ok;
   25694              : 
   25695          922 :   d.target = targ;
   25696          922 :   d.op0 = op0;
   25697          922 :   d.op1 = op1;
   25698          922 :   d.vmode = GET_MODE (targ);
   25699          922 :   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   25700          922 :   d.one_operand_p = false;
   25701          922 :   d.testing_p = false;
   25702              : 
   25703          922 :   base = high_p ? nelt / 2 : 0;
   25704         3642 :   for (i = 0; i < nelt / 2; ++i)
   25705              :     {
   25706         2720 :       d.perm[i * 2] = i + base;
   25707         2720 :       d.perm[i * 2 + 1] = i + base + nelt;
   25708              :     }
   25709              : 
   25710              :   /* Note that for AVX this isn't one instruction.  */
   25711          922 :   ok = ix86_expand_vec_perm_const_1 (&d);
   25712          922 :   gcc_assert (ok);
   25713          922 : }
   25714              : 
   25715              : /* Expand a vector operation shift by constant for a V*QImode in terms of the
   25716              :    same operation on V*HImode. Return true if success. */
   25717              : static bool
   25718          386 : ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
   25719              :                                      rtx dest, rtx op1, rtx op2)
   25720              : {
   25721          386 :   machine_mode qimode, himode;
   25722          386 :   HOST_WIDE_INT and_constant, xor_constant;
   25723          386 :   HOST_WIDE_INT shift_amount;
   25724          386 :   rtx vec_const_and, vec_const_xor;
   25725          386 :   rtx tmp, op1_subreg;
   25726          386 :   rtx (*gen_shift) (rtx, rtx, rtx);
   25727          386 :   rtx (*gen_and) (rtx, rtx, rtx);
   25728          386 :   rtx (*gen_xor) (rtx, rtx, rtx);
   25729          386 :   rtx (*gen_sub) (rtx, rtx, rtx);
   25730              : 
   25731              :   /* Only optimize shift by constant.  */
   25732          386 :   if (!CONST_INT_P (op2))
   25733              :     return false;
   25734              : 
   25735          386 :   qimode = GET_MODE (dest);
   25736          386 :   shift_amount = INTVAL (op2);
   25737              :   /* Do nothing when shift amount greater equal 8.  */
   25738          386 :   if (shift_amount > 7)
   25739              :     return false;
   25740              : 
   25741          386 :   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
   25742              : 
   25743              : 
   25744          386 :   if (shift_amount == 7
   25745          386 :       && code == ASHIFTRT)
   25746              :     {
   25747           40 :       if (qimode == V16QImode
   25748            8 :           || qimode == V32QImode)
   25749              :         {
   25750           39 :           rtx zero = gen_reg_rtx (qimode);
   25751           39 :           emit_move_insn (zero, CONST0_RTX (qimode));
   25752           39 :           emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
   25753           39 :         }
   25754              :       else
   25755              :         {
   25756            1 :           gcc_assert (qimode == V64QImode);
   25757            1 :           rtx kmask = gen_reg_rtx (DImode);
   25758            1 :           emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
   25759            1 :           emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
   25760              :         }
   25761           40 :       return true;
   25762              :     }
   25763              : 
   25764              :   /* Record sign bit.  */
   25765          346 :   xor_constant = 1 << (8 - shift_amount - 1);
   25766              : 
   25767              :   /* Zero upper/lower bits shift from left/right element.  */
   25768          346 :   and_constant
   25769          346 :     = (code == ASHIFT ? 256 - (1 << shift_amount)
   25770          317 :        : (1 << (8 - shift_amount)) - 1);
   25771              : 
   25772          346 :   switch (qimode)
   25773              :     {
   25774          333 :     case V16QImode:
   25775          333 :       himode = V8HImode;
   25776          281 :       gen_shift =
   25777              :         ((code == ASHIFT)
   25778          333 :          ? gen_ashlv8hi3
   25779          313 :          : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
   25780              :       gen_and = gen_andv16qi3;
   25781              :       gen_xor = gen_xorv16qi3;
   25782              :       gen_sub = gen_subv16qi3;
   25783              :       break;
   25784            6 :     case V32QImode:
   25785            6 :       himode = V16HImode;
   25786            1 :       gen_shift =
   25787              :         ((code == ASHIFT)
   25788            6 :          ? gen_ashlv16hi3
   25789            2 :          : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
   25790              :       gen_and = gen_andv32qi3;
   25791              :       gen_xor = gen_xorv32qi3;
   25792              :       gen_sub = gen_subv32qi3;
   25793              :       break;
   25794            7 :     case V64QImode:
   25795            7 :       himode = V32HImode;
   25796            1 :       gen_shift =
   25797              :         ((code == ASHIFT)
   25798            7 :          ? gen_ashlv32hi3
   25799            2 :          : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
   25800              :       gen_and = gen_andv64qi3;
   25801              :       gen_xor = gen_xorv64qi3;
   25802              :       gen_sub = gen_subv64qi3;
   25803              :       break;
   25804            0 :     default:
   25805            0 :       gcc_unreachable ();
   25806              :     }
   25807              : 
   25808          346 :   tmp = gen_reg_rtx (himode);
   25809          346 :   vec_const_and = gen_reg_rtx (qimode);
   25810          346 :   op1_subreg = lowpart_subreg (himode, op1, qimode);
   25811              : 
   25812              :   /* For ASHIFT and LSHIFTRT, perform operation like
   25813              :      vpsllw/vpsrlw $shift_amount, %op1, %dest.
   25814              :      vpand %vec_const_and, %dest.  */
   25815          346 :   emit_insn (gen_shift (tmp, op1_subreg, op2));
   25816          346 :   emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
   25817          346 :   emit_move_insn (vec_const_and,
   25818              :                   ix86_build_const_vector (qimode, true,
   25819          346 :                                            gen_int_mode (and_constant, QImode)));
   25820          346 :   emit_insn (gen_and (dest, dest, vec_const_and));
   25821              : 
   25822              :   /* For ASHIFTRT, perform extra operation like
   25823              :      vpxor %vec_const_xor, %dest, %dest
   25824              :      vpsubb %vec_const_xor, %dest, %dest  */
   25825          346 :   if (code == ASHIFTRT)
   25826              :     {
   25827           34 :       vec_const_xor = gen_reg_rtx (qimode);
   25828           34 :       emit_move_insn (vec_const_xor,
   25829              :                       ix86_build_const_vector (qimode, true,
   25830           34 :                                                gen_int_mode (xor_constant, QImode)));
   25831           34 :       emit_insn (gen_xor (dest, dest, vec_const_xor));
   25832           34 :       emit_insn (gen_sub (dest, dest, vec_const_xor));
   25833              :     }
   25834              :   return true;
   25835              : }
   25836              : 
/* Expand the operation CODE (MULT, ASHIFT, ASHIFTRT or LSHIFTRT) on the
   V4QImode or V8QImode operands OP1 and OP2, storing the result in DEST,
   whose mode selects the vector mode.  OP2 is either a vector of the same
   mode or, for shifts, a non-vector shift count.  The operation is
   carried out in V8HImode and the low bytes of the word results are
   packed back into bytes.  */

void
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  rtx qop1, qop2, hop1, hop2, qdest, hdest;
  /* True when OP2 is itself an integer vector rather than a scalar
     count.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  /* Passed to ix86_expand_sse_unpack below; false only for arithmetic
     right shifts (presumably selecting sign- instead of zero-extension
     -- confirm against ix86_expand_sse_unpack).  */
  bool uns_p = code != ASHIFTRT;

  /* Only the partial (32-bit and 64-bit) byte vector modes are handled.  */
  switch (qimode)
    {
    case E_V4QImode:
    case E_V8QImode:
      break;
    default:
      gcc_unreachable ();
    }

  /* View the inputs as the low part of full V16QImode registers.  */
  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  qdest = gen_reg_rtx (V16QImode);

  /* Shifts by a constant are first tried via the word shift plus mask
     expansion.  */
  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
         Even with SSE4.1 the alternative is better.  */
      && !TARGET_SSE4_1
      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
    {
      emit_move_insn (dest, gen_lowpart (qimode, qdest));
      return;
    }

  /* An arithmetic right shift by 7 leaves only copies of the sign bit, so
     it is emitted as the comparison 0 > OP1.  */
  if (CONST_INT_P (op2)
      && code == ASHIFTRT
      && INTVAL (op2) == 7)
    {
      rtx zero = gen_reg_rtx (qimode);
      emit_move_insn (zero, CONST0_RTX (qimode));
      emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
      return;
    }

  /* Widen the operands to V8HImode.  */
  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      if (!TARGET_SSE4_1)
        {
          /* Unpack data such that we've got a source byte in each low byte
             of each word.  We don't care what goes into the high byte of
             each word.  Rather than trying to get zero in there, most
             convenient is to let it be a copy of the low byte.  */
          hop1 = copy_to_reg (qop1);
          hop2 = copy_to_reg (qop2);
          emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
          emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
          break;
        }
      /* FALLTHRU */
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      hop1 = gen_reg_rtx (V8HImode);
      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
      /* mult/vashr/vlshr/vashl  */
      if (op2vec)
        {
          hop2 = gen_reg_rtx (V8HImode);
          ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
        }
      else
        /* A scalar shift count is used as is.  */
        hop2 = qop2;

      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation on words.  */
  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (V8HImode);
      emit_insn (gen_rtx_SET (hdest,
                              simplify_gen_binary (code, V8HImode,
                                                   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
                                NULL_RTX, 1, OPTAB_DIRECT);

  /* Narrow the word results back to bytes.  */
  if (TARGET_AVX512BW && TARGET_AVX512VL)
    {
      /* A truncating move does the job directly.  It produces a V8QImode
         value, so it can target DEST itself only when DEST has that
         mode.  */
      if (qimode == V8QImode)
        qdest = dest;
      else
        qdest = gen_reg_rtx (V8QImode);

      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
    }
  else
    {
      struct expand_vec_perm_d d;
      rtx qres = gen_lowpart (V16QImode, hdest);
      bool ok;
      int i;

      /* Merge the data back into the right place.  */
      d.target = qdest;
      d.op0 = d.op1 = qres;
      d.vmode = V16QImode;
      d.nelt = 16;
      d.one_operand_p = TARGET_SSSE3;
      d.testing_p = false;

      /* Select every other byte, i.e. the byte at the even index of each
         word of the result.  Both operands are the same register.  */
      for (i = 0; i < d.nelt; ++i)
        d.perm[i] = i * 2;

      ok = ix86_expand_vec_perm_const_1 (&d);
      gcc_assert (ok);
    }

  /* Unless the result was produced in DEST directly, copy the low part of
     the temporary into it.  */
  if (qdest != dest)
    emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
   25967              : 
   25968              : /* Emit instruction in 2x wider mode.  For example, optimize
   25969              :    vector MUL generation like
   25970              : 
   25971              :    vpmovzxbw ymm2, xmm0
   25972              :    vpmovzxbw ymm3, xmm1
   25973              :    vpmullw   ymm4, ymm2, ymm3
   25974              :    vpmovwb   xmm0, ymm4
   25975              : 
   25976              :    it would take less instructions than ix86_expand_vecop_qihi.
   25977              :    Return true if success.  */
   25978              : 
   25979              : static bool
   25980         1155 : ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   25981              : {
   25982         1155 :   machine_mode himode, qimode = GET_MODE (dest);
   25983         1155 :   machine_mode wqimode;
   25984         1155 :   rtx qop1, qop2, hop1, hop2, hdest;
   25985         1155 :   rtx (*gen_truncate)(rtx, rtx) = NULL;
   25986         1155 :   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   25987         1155 :   bool uns_p = code != ASHIFTRT;
   25988              : 
   25989              :   /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
   25990              :      generic permutation to merge the data back into the right place.  This
   25991              :      permutation results in VPERMQ, which is slow, so better fall back to
   25992              :      ix86_expand_vecop_qihi.  */
   25993         1155 :   if (!TARGET_AVX512BW
   25994          327 :       || (qimode == V16QImode && !TARGET_AVX512VL)
   25995              :       /* There are no V64HImode instructions.  */
   25996          327 :       || qimode == V64QImode)
   25997              :      return false;
   25998              : 
   25999              :   /* Do not generate ymm/zmm instructions when
   26000              :      target prefers 128/256 bit vector width.  */
   26001          317 :   if ((qimode == V16QImode && TARGET_PREFER_AVX128)
   26002          317 :       || (qimode == V32QImode && TARGET_PREFER_AVX256))
   26003              :     return false;
   26004              : 
   26005          312 :   switch (qimode)
   26006              :     {
   26007              :     case E_V16QImode:
   26008              :       himode = V16HImode;
   26009              :       gen_truncate = gen_truncv16hiv16qi2;
   26010              :       break;
   26011           57 :     case E_V32QImode:
   26012           57 :       himode = V32HImode;
   26013           57 :       gen_truncate = gen_truncv32hiv32qi2;
   26014           57 :       break;
   26015            0 :     default:
   26016            0 :       gcc_unreachable ();
   26017              :     }
   26018              : 
   26019          312 :   wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
   26020          312 :   qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
   26021              : 
   26022          312 :   if (op2vec)
   26023          312 :     qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
   26024              :   else
   26025              :     qop2 = op2;
   26026              : 
   26027          312 :   hop1 = gen_reg_rtx (himode);
   26028          312 :   ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
   26029              : 
   26030          312 :   if (op2vec)
   26031              :     {
   26032          312 :       hop2 = gen_reg_rtx (himode);
   26033          312 :       ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
   26034              :     }
   26035              :   else
   26036              :     hop2 = qop2;
   26037              : 
   26038          312 :   if (code != MULT && op2vec)
   26039              :     {
   26040              :       /* Expand vashr/vlshr/vashl.  */
   26041           14 :       hdest = gen_reg_rtx (himode);
   26042           14 :       emit_insn (gen_rtx_SET (hdest,
   26043              :                               simplify_gen_binary (code, himode,
   26044              :                                                    hop1, hop2)));
   26045              :     }
   26046              :   else
   26047              :     /* Expand mult/ashr/lshr/ashl.  */
   26048          298 :     hdest = expand_simple_binop (himode, code, hop1, hop2,
   26049              :                                  NULL_RTX, 1, OPTAB_DIRECT);
   26050              : 
   26051          312 :   emit_insn (gen_truncate (dest, hdest));
   26052          312 :   return true;
   26053              : }
   26054              : 
   26055              : /* Expand a vector operation CODE for a V*QImode in terms of the
   26056              :    same operation on V*HImode.  */
   26057              : 
   26058              : void
   26059         1487 : ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   26060              : {
   26061         1487 :   machine_mode qimode = GET_MODE (dest);
   26062         1487 :   machine_mode himode;
   26063         1487 :   rtx (*gen_il) (rtx, rtx, rtx);
   26064         1487 :   rtx (*gen_ih) (rtx, rtx, rtx);
   26065         1487 :   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
   26066         1487 :   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   26067         1487 :   struct expand_vec_perm_d d;
   26068         1487 :   bool full_interleave = true;
   26069         1487 :   bool uns_p = code != ASHIFTRT;
   26070         1487 :   bool ok;
   26071         1487 :   int i;
   26072              : 
   26073         1487 :   if (CONST_INT_P (op2)
   26074          332 :       && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
   26075         1819 :       && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
   26076          644 :     return;
   26077              : 
   26078         1155 :   if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
   26079              :     return;
   26080              : 
   26081          843 :   switch (qimode)
   26082              :     {
   26083              :     case E_V16QImode:
   26084              :       himode = V8HImode;
   26085              :       break;
   26086           44 :     case E_V32QImode:
   26087           44 :       himode = V16HImode;
   26088           44 :       break;
   26089           10 :     case E_V64QImode:
   26090           10 :       himode = V32HImode;
   26091           10 :       break;
   26092            0 :     default:
   26093            0 :       gcc_unreachable ();
   26094              :     }
   26095              : 
   26096          843 :   switch (code)
   26097              :     {
   26098          816 :     case MULT:
   26099          816 :       gcc_assert (op2vec);
   26100              :       /* Unpack data such that we've got a source byte in each low byte of
   26101              :          each word.  We don't care what goes into the high byte of each word.
   26102              :          Rather than trying to get zero in there, most convenient is to let
   26103              :          it be a copy of the low byte.  */
   26104          816 :       switch (qimode)
   26105              :         {
   26106              :         case E_V16QImode:
   26107              :           gen_il = gen_vec_interleave_lowv16qi;
   26108              :           gen_ih = gen_vec_interleave_highv16qi;
   26109              :           break;
   26110           44 :         case E_V32QImode:
   26111           44 :           gen_il = gen_avx2_interleave_lowv32qi;
   26112           44 :           gen_ih = gen_avx2_interleave_highv32qi;
   26113           44 :           full_interleave = false;
   26114           44 :           break;
   26115            8 :         case E_V64QImode:
   26116            8 :           gen_il = gen_avx512bw_interleave_lowv64qi;
   26117            8 :           gen_ih = gen_avx512bw_interleave_highv64qi;
   26118            8 :           full_interleave = false;
   26119            8 :           break;
   26120            0 :         default:
   26121            0 :           gcc_unreachable ();
   26122              :         }
   26123              : 
   26124          816 :       op2_l = gen_reg_rtx (qimode);
   26125          816 :       op2_h = gen_reg_rtx (qimode);
   26126          816 :       emit_insn (gen_il (op2_l, op2, op2));
   26127          816 :       emit_insn (gen_ih (op2_h, op2, op2));
   26128              : 
   26129          816 :       op1_l = gen_reg_rtx (qimode);
   26130          816 :       op1_h = gen_reg_rtx (qimode);
   26131          816 :       emit_insn (gen_il (op1_l, op1, op1));
   26132          816 :       emit_insn (gen_ih (op1_h, op1, op1));
   26133          816 :       break;
   26134              : 
   26135           27 :     case ASHIFT:
   26136           27 :     case ASHIFTRT:
   26137           27 :     case LSHIFTRT:
   26138           27 :       op1_l = gen_reg_rtx (himode);
   26139           27 :       op1_h = gen_reg_rtx (himode);
   26140           27 :       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
   26141           27 :       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
   26142              :       /* vashr/vlshr/vashl  */
   26143           27 :       if (op2vec)
   26144              :         {
   26145            2 :           rtx tmp = force_reg (qimode, op2);
   26146            2 :           op2_l = gen_reg_rtx (himode);
   26147            2 :           op2_h = gen_reg_rtx (himode);
   26148            2 :           ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
   26149            2 :           ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
   26150              :         }
   26151              :       else
   26152              :         op2_l = op2_h = op2;
   26153              : 
   26154              :       break;
   26155            0 :     default:
   26156            0 :       gcc_unreachable ();
   26157              :     }
   26158              : 
   26159          843 :   if (code != MULT && op2vec)
   26160              :     {
   26161              :       /* Expand vashr/vlshr/vashl.  */
   26162            2 :       res_l = gen_reg_rtx (himode);
   26163            2 :       res_h = gen_reg_rtx (himode);
   26164            2 :       emit_insn (gen_rtx_SET (res_l,
   26165              :                               simplify_gen_binary (code, himode,
   26166              :                                                    op1_l, op2_l)));
   26167            2 :       emit_insn (gen_rtx_SET (res_h,
   26168              :                               simplify_gen_binary (code, himode,
   26169              :                                                    op1_h, op2_h)));
   26170              :     }
   26171              :   else
   26172              :     {
   26173              :       /* Expand mult/ashr/lshr/ashl.  */
   26174          841 :       res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
   26175              :                                    1, OPTAB_DIRECT);
   26176          841 :       res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
   26177              :                                    1, OPTAB_DIRECT);
   26178              :     }
   26179              : 
   26180          843 :   gcc_assert (res_l && res_h);
   26181              : 
   26182              :   /* Merge the data back into the right place.  */
   26183          843 :   d.target = dest;
   26184          843 :   d.op0 = gen_lowpart (qimode, res_l);
   26185          843 :   d.op1 = gen_lowpart (qimode, res_h);
   26186          843 :   d.vmode = qimode;
   26187          843 :   d.nelt = GET_MODE_NUNITS (qimode);
   26188          843 :   d.one_operand_p = false;
   26189          843 :   d.testing_p = false;
   26190              : 
   26191          843 :   if (full_interleave)
   26192              :     {
   26193              :       /* We used the full interleave, the desired
   26194              :          results are in the even elements.  */
   26195        13543 :       for (i = 0; i < d.nelt; ++i)
   26196        12752 :         d.perm[i] = i * 2;
   26197              :     }
   26198              :   else
   26199              :     {
   26200              :       /* For AVX, the interleave used above was not cross-lane.  So the
   26201              :          extraction is evens but with the second and third quarter swapped.
   26202              :          Happily, that is even one insn shorter than even extraction.
   26203              :          For AVX512BW we have 4 lanes.  We extract evens from within a lane,
   26204              :          always first from the first and then from the second source operand,
   26205              :          the index bits above the low 4 bits remain the same.
   26206              :          Thus, for d.nelt == 32 we want permutation
   26207              :          0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
   26208              :          and for d.nelt == 64 we want permutation
   26209              :          0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
   26210              :          32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
   26211         1972 :       for (i = 0; i < d.nelt; ++i)
   26212         2880 :         d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
   26213              :     }
   26214              : 
   26215          843 :   ok = ix86_expand_vec_perm_const_1 (&d);
   26216          843 :   gcc_assert (ok);
   26217              : }
   26218              : 
   26219              : /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   26220              :    if op is CONST_VECTOR with all odd elements equal to their
   26221              :    preceding element.  */
   26222              : 
   26223              : static bool
   26224         8744 : const_vector_equal_evenodd_p (rtx op)
   26225              : {
   26226         8744 :   machine_mode mode = GET_MODE (op);
   26227         8744 :   int i, nunits = GET_MODE_NUNITS (mode);
   26228         8744 :   if (!CONST_VECTOR_P (op)
   26229         8744 :       || nunits != CONST_VECTOR_NUNITS (op))
   26230              :     return false;
   26231         3560 :   for (i = 0; i < nunits; i += 2)
   26232         2869 :     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
   26233              :       return false;
   26234              :   return true;
   26235              : }
   26236              : 
   26237              : void
   26238         8856 : ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
   26239              :                                bool uns_p, bool odd_p)
   26240              : {
   26241         8856 :   machine_mode mode = GET_MODE (op1);
   26242         8856 :   machine_mode wmode = GET_MODE (dest);
   26243         8856 :   rtx x;
   26244         8856 :   rtx orig_op1 = op1, orig_op2 = op2;
   26245              : 
   26246         8856 :   if (!nonimmediate_operand (op1, mode))
   26247            0 :     op1 = force_reg (mode, op1);
   26248         8856 :   if (!nonimmediate_operand (op2, mode))
   26249         3316 :     op2 = force_reg (mode, op2);
   26250              : 
   26251              :   /* We only play even/odd games with vectors of SImode.  */
   26252         8856 :   gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
   26253              : 
   26254              :   /* If we're looking for the odd results, shift those members down to
   26255              :      the even slots.  For some cpus this is faster than a PSHUFD.  */
   26256         8856 :   if (odd_p)
   26257              :     {
   26258              :       /* For XOP use vpmacsdqh, but only for smult, as it is only
   26259              :          signed.  */
   26260         4390 :       if (TARGET_XOP && mode == V4SImode && !uns_p)
   26261              :         {
   26262           18 :           x = force_reg (wmode, CONST0_RTX (wmode));
   26263           18 :           emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
   26264           18 :           return;
   26265              :         }
   26266              : 
   26267         8744 :       x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
   26268         4372 :       if (!const_vector_equal_evenodd_p (orig_op1))
   26269         4372 :         op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
   26270              :                             x, NULL, 1, OPTAB_DIRECT);
   26271         4372 :       if (!const_vector_equal_evenodd_p (orig_op2))
   26272         3681 :         op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
   26273              :                             x, NULL, 1, OPTAB_DIRECT);
   26274         4372 :       op1 = gen_lowpart (mode, op1);
   26275         4372 :       op2 = gen_lowpart (mode, op2);
   26276              :     }
   26277              : 
   26278         8838 :   if (mode == V16SImode)
   26279              :     {
   26280            6 :       if (uns_p)
   26281            0 :         x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
   26282              :       else
   26283            6 :         x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
   26284              :     }
   26285         8832 :   else if (mode == V8SImode)
   26286              :     {
   26287          139 :       if (uns_p)
   26288           59 :         x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
   26289              :       else
   26290           80 :         x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
   26291              :     }
   26292         8693 :   else if (uns_p)
   26293         7638 :     x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
   26294         1055 :   else if (TARGET_SSE4_1)
   26295          369 :     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
   26296              :   else
   26297              :     {
   26298          686 :       rtx s1, s2, t0, t1, t2;
   26299              : 
   26300              :       /* The easiest way to implement this without PMULDQ is to go through
   26301              :          the motions as if we are performing a full 64-bit multiply.  With
   26302              :          the exception that we need to do less shuffling of the elements.  */
   26303              : 
   26304              :       /* Compute the sign-extension, aka highparts, of the two operands.  */
   26305          686 :       s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
   26306              :                                 op1, pc_rtx, pc_rtx);
   26307          686 :       s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
   26308              :                                 op2, pc_rtx, pc_rtx);
   26309              : 
   26310              :       /* Multiply LO(A) * HI(B), and vice-versa.  */
   26311          686 :       t1 = gen_reg_rtx (wmode);
   26312          686 :       t2 = gen_reg_rtx (wmode);
   26313          686 :       emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
   26314          686 :       emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
   26315              : 
   26316              :       /* Multiply LO(A) * LO(B).  */
   26317          686 :       t0 = gen_reg_rtx (wmode);
   26318          686 :       emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
   26319              : 
   26320              :       /* Combine and shift the highparts into place.  */
   26321          686 :       t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
   26322          686 :       t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
   26323              :                          1, OPTAB_DIRECT);
   26324              : 
   26325              :       /* Combine high and low parts.  */
   26326          686 :       force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
   26327          686 :       return;
   26328              :     }
   26329         8152 :   emit_insn (x);
   26330              : }
   26331              : 
   26332              : void
   26333          975 : ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
   26334              :                             bool uns_p, bool high_p)
   26335              : {
   26336          975 :   machine_mode wmode = GET_MODE (dest);
   26337          975 :   machine_mode mode = GET_MODE (op1);
   26338          975 :   rtx t1, t2, t3, t4, mask;
   26339              : 
   26340          975 :   switch (mode)
   26341              :     {
   26342          297 :     case E_V4SImode:
   26343          297 :       t1 = gen_reg_rtx (mode);
   26344          297 :       t2 = gen_reg_rtx (mode);
   26345          297 :       if (TARGET_XOP && !uns_p)
   26346              :         {
   26347              :           /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
   26348              :              shuffle the elements once so that all elements are in the right
   26349              :              place for immediate use: { A C B D }.  */
   26350           33 :           emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
   26351              :                                         const1_rtx, GEN_INT (3)));
   26352           33 :           emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
   26353              :                                         const1_rtx, GEN_INT (3)));
   26354              :         }
   26355              :       else
   26356              :         {
   26357              :           /* Put the elements into place for the multiply.  */
   26358          264 :           ix86_expand_vec_interleave (t1, op1, op1, high_p);
   26359          264 :           ix86_expand_vec_interleave (t2, op2, op2, high_p);
   26360          264 :           high_p = false;
   26361              :         }
   26362          297 :       ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
   26363          297 :       break;
   26364              : 
   26365           70 :     case E_V8SImode:
   26366              :       /* Shuffle the elements between the lanes.  After this we
   26367              :          have { A B E F | C D G H } for each operand.  */
   26368           70 :       t1 = gen_reg_rtx (V4DImode);
   26369           70 :       t2 = gen_reg_rtx (V4DImode);
   26370           70 :       emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
   26371              :                                       const0_rtx, const2_rtx,
   26372              :                                       const1_rtx, GEN_INT (3)));
   26373           70 :       emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
   26374              :                                       const0_rtx, const2_rtx,
   26375              :                                       const1_rtx, GEN_INT (3)));
   26376              : 
   26377              :       /* Shuffle the elements within the lanes.  After this we
   26378              :          have { A A B B | C C D D } or { E E F F | G G H H }.  */
   26379           70 :       t3 = gen_reg_rtx (V8SImode);
   26380           70 :       t4 = gen_reg_rtx (V8SImode);
   26381          105 :       mask = GEN_INT (high_p
   26382              :                       ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
   26383              :                       : 0 + (0 << 2) + (1 << 4) + (1 << 6));
   26384           70 :       emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
   26385           70 :       emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
   26386              : 
   26387           70 :       ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
   26388           70 :       break;
   26389              : 
   26390          394 :     case E_V8HImode:
   26391          394 :     case E_V16HImode:
   26392          394 :       t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
   26393              :                          uns_p, OPTAB_DIRECT);
   26394          626 :       t2 = expand_binop (mode,
   26395              :                          uns_p ? umul_highpart_optab : smul_highpart_optab,
   26396              :                          op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
   26397          394 :       gcc_assert (t1 && t2);
   26398              : 
   26399          394 :       t3 = gen_reg_rtx (mode);
   26400          394 :       ix86_expand_vec_interleave (t3, t1, t2, high_p);
   26401          394 :       emit_move_insn (dest, gen_lowpart (wmode, t3));
   26402          394 :       break;
   26403              : 
   26404          214 :     case E_V16QImode:
   26405          214 :     case E_V32QImode:
   26406          214 :     case E_V32HImode:
   26407          214 :     case E_V16SImode:
   26408          214 :     case E_V64QImode:
   26409          214 :       t1 = gen_reg_rtx (wmode);
   26410          214 :       t2 = gen_reg_rtx (wmode);
   26411          214 :       ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
   26412          214 :       ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
   26413              : 
   26414          214 :       emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
   26415          214 :       break;
   26416              : 
   26417            0 :     default:
   26418            0 :       gcc_unreachable ();
   26419              :     }
   26420          975 : }
   26421              : 
   26422              : void
   26423         3651 : ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
   26424              : {
   26425         3651 :   rtx res_1, res_2, res_3, res_4;
   26426              : 
   26427         3651 :   res_1 = gen_reg_rtx (V4SImode);
   26428         3651 :   res_2 = gen_reg_rtx (V4SImode);
   26429         3651 :   res_3 = gen_reg_rtx (V2DImode);
   26430         3651 :   res_4 = gen_reg_rtx (V2DImode);
   26431         3651 :   ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
   26432         3651 :   ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
   26433              : 
   26434              :   /* Move the results in element 2 down to element 1; we don't care
   26435              :      what goes in elements 2 and 3.  Then we can merge the parts
   26436              :      back together with an interleave.
   26437              : 
   26438              :      Note that two other sequences were tried:
   26439              :      (1) Use interleaves at the start instead of psrldq, which allows
   26440              :      us to use a single shufps to merge things back at the end.
   26441              :      (2) Use shufps here to combine the two vectors, then pshufd to
   26442              :      put the elements in the correct order.
   26443              :      In both cases the cost of the reformatting stall was too high
   26444              :      and the overall sequence slower.  */
   26445              : 
   26446         3651 :   emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
   26447              :                                 const0_rtx, const2_rtx,
   26448              :                                 const0_rtx, const0_rtx));
   26449         3651 :   emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
   26450              :                                 const0_rtx, const2_rtx,
   26451              :                                 const0_rtx, const0_rtx));
   26452         3651 :   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
   26453              : 
   26454         3651 :   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
   26455         3651 : }
   26456              : 
   26457              : void
   26458          527 : ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
   26459              : {
   26460          527 :   machine_mode mode = GET_MODE (op0);
   26461          527 :   rtx t1, t2, t3, t4, t5, t6;
   26462              : 
   26463          527 :   if (TARGET_AVX512DQ && mode == V8DImode)
   26464           32 :     emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
   26465          495 :   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
   26466           32 :     emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
   26467          463 :   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
   26468           36 :     emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
   26469          427 :   else if (TARGET_XOP && mode == V2DImode)
   26470              :     {
   26471              :       /* op1: A,B,C,D, op2: E,F,G,H */
   26472            2 :       op1 = gen_lowpart (V4SImode, op1);
   26473            2 :       op2 = gen_lowpart (V4SImode, op2);
   26474              : 
   26475            2 :       t1 = gen_reg_rtx (V4SImode);
   26476            2 :       t2 = gen_reg_rtx (V4SImode);
   26477            2 :       t3 = gen_reg_rtx (V2DImode);
   26478            2 :       t4 = gen_reg_rtx (V2DImode);
   26479              : 
   26480              :       /* t1: B,A,D,C */
   26481            2 :       emit_insn (gen_sse2_pshufd_1 (t1, op1,
   26482              :                                     GEN_INT (1),
   26483              :                                     GEN_INT (0),
   26484              :                                     GEN_INT (3),
   26485              :                                     GEN_INT (2)));
   26486              : 
   26487              :       /* t2: (B*E),(A*F),(D*G),(C*H) */
   26488            2 :       emit_insn (gen_mulv4si3 (t2, t1, op2));
   26489              : 
   26490              :       /* t3: (B*E)+(A*F), (D*G)+(C*H) */
   26491            2 :       emit_insn (gen_xop_phadddq (t3, t2));
   26492              : 
   26493              :       /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
   26494            2 :       emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
   26495              : 
   26496              :       /* Multiply lower parts and add all.  */
   26497            2 :       t5 = gen_reg_rtx (V2DImode);
   26498            2 :       emit_insn (gen_vec_widen_umult_even_v4si (t5,
   26499            2 :                                         gen_lowpart (V4SImode, op1),
   26500            2 :                                         gen_lowpart (V4SImode, op2)));
   26501            2 :       force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
   26502              :     }
   26503              :   else
   26504              :     {
   26505          425 :       machine_mode nmode;
   26506          425 :       rtx (*umul) (rtx, rtx, rtx);
   26507              : 
   26508          425 :       if (mode == V2DImode)
   26509              :         {
   26510              :           umul = gen_vec_widen_umult_even_v4si;
   26511              :           nmode = V4SImode;
   26512              :         }
   26513          295 :       else if (mode == V4DImode)
   26514              :         {
   26515              :           umul = gen_vec_widen_umult_even_v8si;
   26516              :           nmode = V8SImode;
   26517              :         }
   26518          116 :       else if (mode == V8DImode)
   26519              :         {
   26520              :           umul = gen_vec_widen_umult_even_v16si;
   26521              :           nmode = V16SImode;
   26522              :         }
   26523              :       else
   26524            0 :         gcc_unreachable ();
   26525              : 
   26526              : 
   26527              :       /* Multiply low parts.  */
   26528          425 :       t1 = gen_reg_rtx (mode);
   26529          425 :       emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
   26530              : 
   26531              :       /* Shift input vectors right 32 bits so we can multiply high parts.  */
   26532          425 :       t6 = GEN_INT (32);
   26533          425 :       t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
   26534          425 :       t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
   26535              : 
   26536              :       /* Multiply high parts by low parts.  */
   26537          425 :       t4 = gen_reg_rtx (mode);
   26538          425 :       t5 = gen_reg_rtx (mode);
   26539          425 :       emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
   26540          425 :       emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
   26541              : 
   26542              :       /* Combine and shift the highparts back.  */
   26543          425 :       t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
   26544          425 :       t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
   26545              : 
   26546              :       /* Combine high and low parts.  */
   26547          425 :       force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
   26548              :     }
   26549              : 
   26550          527 :   set_unique_reg_note (get_last_insn (), REG_EQUAL,
   26551              :                        gen_rtx_MULT (mode, op1, op2));
   26552          527 : }
   26553              : 
   26554              : /* Return true if control transfer instruction INSN
   26555              :    should be encoded with notrack prefix.  */
   26556              : 
   26557              : bool
   26558     14849053 : ix86_notrack_prefixed_insn_p (rtx_insn *insn)
   26559              : {
   26560     14849053 :   if (!insn || !((flag_cf_protection & CF_BRANCH)))
   26561              :     return false;
   26562              : 
   26563      3918377 :   if (CALL_P (insn))
   26564              :     {
   26565      1387738 :       rtx call = get_call_rtx_from (insn);
   26566      1387738 :       gcc_assert (call != NULL_RTX);
   26567      1387738 :       rtx addr = XEXP (call, 0);
   26568              : 
   26569              :       /* Do not emit 'notrack' if it's not an indirect call.  */
   26570      1387738 :       if (MEM_P (addr)
   26571      1387738 :           && SYMBOL_REF_P (XEXP (addr, 0)))
   26572              :         return false;
   26573              :       else
   26574        64646 :         return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
   26575              :     }
   26576              : 
   26577      2530639 :   if (JUMP_P (insn) && !flag_cet_switch)
   26578              :     {
   26579      2517084 :       rtx target = JUMP_LABEL (insn);
   26580      2517084 :       if (target == NULL_RTX || ANY_RETURN_P (target))
   26581              :         return false;
   26582              : 
   26583              :       /* Check the jump is a switch table.  */
   26584      2517046 :       rtx_insn *label = as_a<rtx_insn *> (target);
   26585      2517046 :       rtx_insn *table = next_insn (label);
   26586      2517046 :       if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
   26587              :         return false;
   26588              :       else
   26589              :         return true;
   26590              :     }
   26591              :   return false;
   26592              : }
   26593              : 
   26594              : /* Calculate integer abs() using only SSE2 instructions.  */
   26595              : 
   26596              : void
   26597          552 : ix86_expand_sse2_abs (rtx target, rtx input)
   26598              : {
   26599          552 :   machine_mode mode = GET_MODE (target);
   26600          552 :   rtx tmp0, tmp1, x;
   26601              : 
   26602          552 :   switch (mode)
   26603              :     {
   26604           33 :     case E_V2DImode:
   26605           33 :     case E_V4DImode:
   26606              :       /* For 64-bit signed integer X, with SSE4.2 use
   26607              :          pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
   26608              :          Otherwise handle it similarly to V4SImode, except use 64 as W instead of
   26609              :          32 and use logical instead of arithmetic right shift (which is
   26610              :          unimplemented) and subtract.  */
   26611           33 :       if (TARGET_SSE4_2)
   26612              :         {
   26613            9 :           tmp0 = gen_reg_rtx (mode);
   26614            9 :           tmp1 = gen_reg_rtx (mode);
   26615            9 :           emit_move_insn (tmp1, CONST0_RTX (mode));
   26616            9 :           if (mode == E_V2DImode)
   26617            6 :             emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
   26618              :           else
   26619            3 :             emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
   26620              :         }
   26621              :       else
   26622              :         {
   26623           48 :           tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
   26624           24 :                                       GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
   26625              :                                                - 1), NULL, 0, OPTAB_DIRECT);
   26626           24 :           tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
   26627              :         }
   26628              : 
   26629           33 :       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
   26630              :                                   NULL, 0, OPTAB_DIRECT);
   26631           33 :       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
   26632              :                                target, 0, OPTAB_DIRECT);
   26633           33 :       break;
   26634              : 
   26635           61 :     case E_V4SImode:
   26636              :       /* For 32-bit signed integer X, the best way to calculate the absolute
   26637              :          value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
   26638           61 :       tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
   26639           61 :                                   GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
   26640              :                                   NULL, 0, OPTAB_DIRECT);
   26641           61 :       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
   26642              :                                   NULL, 0, OPTAB_DIRECT);
   26643           61 :       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
   26644              :                                target, 0, OPTAB_DIRECT);
   26645           61 :       break;
   26646              : 
   26647           91 :     case E_V8HImode:
   26648              :       /* For 16-bit signed integer X, the best way to calculate the absolute
   26649              :          value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
   26650           91 :       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
   26651              : 
   26652           91 :       x = expand_simple_binop (mode, SMAX, tmp0, input,
   26653              :                                target, 0, OPTAB_DIRECT);
   26654           91 :       break;
   26655              : 
   26656          367 :     case E_V16QImode:
   26657              :       /* For 8-bit signed integer X, the best way to calculate the absolute
   26658              :          value of X is min ((unsigned char) X, (unsigned char) (-X)),
   26659              :          as SSE2 provides the PMINUB insn.  */
   26660          367 :       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
   26661              : 
   26662          367 :       x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
   26663              :                                target, 0, OPTAB_DIRECT);
   26664          367 :       break;
   26665              : 
   26666            0 :     default:
   26667            0 :       gcc_unreachable ();
   26668              :     }
   26669              : 
   26670          552 :   if (x != target)
   26671            0 :     emit_move_insn (target, x);
   26672          552 : }
   26673              : 
   26674              : /* Expand an extract from a vector register through pextr insn.
   26675              :    Return true if successful.  */
   26676              : 
   26677              : bool
   26678       101780 : ix86_expand_pextr (rtx *operands)
   26679              : {
   26680       101780 :   rtx dst = operands[0];
   26681       101780 :   rtx src = operands[1];
   26682              : 
   26683       101780 :   unsigned int size = INTVAL (operands[2]);
   26684       101780 :   unsigned int pos = INTVAL (operands[3]);
   26685              : 
   26686       101780 :   if (SUBREG_P (dst))
   26687              :     {
   26688              :       /* Reject non-lowpart subregs.  */
   26689        58612 :       if (SUBREG_BYTE (dst) > 0)
   26690              :         return false;
   26691        58483 :       dst = SUBREG_REG (dst);
   26692              :     }
   26693              : 
   26694       101651 :   if (SUBREG_P (src))
   26695              :     {
   26696        33907 :       pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
   26697        33907 :       src = SUBREG_REG (src);
   26698              :     }
   26699              : 
   26700       101651 :   switch (GET_MODE (src))
   26701              :     {
   26702            0 :     case E_V16QImode:
   26703            0 :     case E_V8HImode:
   26704            0 :     case E_V4SImode:
   26705            0 :     case E_V2DImode:
   26706            0 :     case E_V1TImode:
   26707            0 :       {
   26708            0 :         machine_mode srcmode, dstmode;
   26709            0 :         rtx d, pat;
   26710              : 
   26711            0 :         if (!int_mode_for_size (size, 0).exists (&dstmode))
   26712            0 :           return false;
   26713              : 
   26714            0 :         switch (dstmode)
   26715              :           {
   26716            0 :           case E_QImode:
   26717            0 :             if (!TARGET_SSE4_1)
   26718              :               return false;
   26719              :             srcmode = V16QImode;
   26720              :             break;
   26721              : 
   26722            0 :           case E_HImode:
   26723            0 :             if (!TARGET_SSE2)
   26724              :               return false;
   26725              :             srcmode = V8HImode;
   26726              :             break;
   26727              : 
   26728            0 :           case E_SImode:
   26729            0 :             if (!TARGET_SSE4_1)
   26730              :               return false;
   26731              :             srcmode = V4SImode;
   26732              :             break;
   26733              : 
   26734            0 :           case E_DImode:
   26735            0 :             gcc_assert (TARGET_64BIT);
   26736            0 :             if (!TARGET_SSE4_1)
   26737              :               return false;
   26738              :             srcmode = V2DImode;
   26739              :             break;
   26740              : 
   26741              :           default:
   26742              :             return false;
   26743              :           }
   26744              : 
   26745              :         /* Reject extractions from misaligned positions.  */
   26746            0 :         if (pos & (size-1))
   26747              :           return false;
   26748              : 
   26749            0 :         if (GET_MODE (dst) == dstmode)
   26750              :           d = dst;
   26751              :         else
   26752            0 :           d = gen_reg_rtx (dstmode);
   26753              : 
   26754              :         /* Construct insn pattern.  */
   26755            0 :         pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
   26756            0 :         pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
   26757              : 
   26758              :         /* Let the rtl optimizers know about the zero extension performed.  */
   26759            0 :         if (dstmode == QImode || dstmode == HImode)
   26760              :           {
   26761            0 :             pat = gen_rtx_ZERO_EXTEND (SImode, pat);
   26762            0 :             d = gen_lowpart (SImode, d);
   26763              :           }
   26764              : 
   26765            0 :         emit_insn (gen_rtx_SET (d, pat));
   26766              : 
   26767            0 :         if (d != dst)
   26768            0 :           emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
   26769              :         return true;
   26770              :       }
   26771              : 
   26772              :     default:
   26773              :       return false;
   26774              :     }
   26775              : }
   26776              : 
   26777              : /* Expand an insert into a vector register through pinsr insn.
   26778              :    Return true if successful.  */
   26779              : 
   26780              : bool
   26781       109797 : ix86_expand_pinsr (rtx *operands)
   26782              : {
   26783       109797 :   rtx dst = operands[0];
   26784       109797 :   rtx src = operands[3];
   26785              : 
   26786       109797 :   unsigned int size = INTVAL (operands[1]);
   26787       109797 :   unsigned int pos = INTVAL (operands[2]);
   26788              : 
   26789       109797 :   if (SUBREG_P (dst))
   26790              :     {
   26791        61699 :       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
   26792        61699 :       dst = SUBREG_REG (dst);
   26793              :     }
   26794              : 
   26795       109797 :   switch (GET_MODE (dst))
   26796              :     {
   26797           20 :     case E_V16QImode:
   26798           20 :     case E_V8HImode:
   26799           20 :     case E_V4SImode:
   26800           20 :     case E_V2DImode:
   26801           20 :     case E_V1TImode:
   26802           20 :       {
   26803           20 :         machine_mode srcmode, dstmode;
   26804           20 :         rtx (*pinsr)(rtx, rtx, rtx, rtx);
   26805           20 :         rtx d;
   26806              : 
   26807           20 :         if (!int_mode_for_size (size, 0).exists (&srcmode))
   26808            0 :           return false;
   26809              : 
   26810           20 :         switch (srcmode)
   26811              :           {
   26812            1 :           case E_QImode:
   26813            1 :             if (!TARGET_SSE4_1)
   26814              :               return false;
   26815              :             dstmode = V16QImode;
   26816              :             pinsr = gen_sse4_1_pinsrb;
   26817              :             break;
   26818              : 
   26819            5 :           case E_HImode:
   26820            5 :             if (!TARGET_SSE2)
   26821              :               return false;
   26822              :             dstmode = V8HImode;
   26823              :             pinsr = gen_sse2_pinsrw;
   26824              :             break;
   26825              : 
   26826           14 :           case E_SImode:
   26827           14 :             if (!TARGET_SSE4_1)
   26828              :               return false;
   26829              :             dstmode = V4SImode;
   26830              :             pinsr = gen_sse4_1_pinsrd;
   26831              :             break;
   26832              : 
   26833            0 :           case E_DImode:
   26834            0 :             gcc_assert (TARGET_64BIT);
   26835            0 :             if (!TARGET_SSE4_1)
   26836              :               return false;
   26837              :             dstmode = V2DImode;
   26838              :             pinsr = gen_sse4_1_pinsrq;
   26839              :             break;
   26840              : 
   26841              :           default:
   26842              :             return false;
   26843              :           }
   26844              : 
   26845              :         /* Reject insertions to misaligned positions.  */
   26846            7 :         if (pos & (size-1))
   26847              :           return false;
   26848              : 
   26849            7 :         if (SUBREG_P (src))
   26850              :           {
   26851            7 :             unsigned int srcpos = SUBREG_BYTE (src);
   26852              : 
   26853            7 :             if (srcpos > 0)
   26854              :               {
   26855            0 :                 rtx extr_ops[4];
   26856              : 
   26857            0 :                 extr_ops[0] = gen_reg_rtx (srcmode);
   26858            0 :                 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
   26859            0 :                 extr_ops[2] = GEN_INT (size);
   26860            0 :                 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
   26861              : 
   26862            0 :                 if (!ix86_expand_pextr (extr_ops))
   26863            0 :                   return false;
   26864              : 
   26865            0 :                 src = extr_ops[0];
   26866              :               }
   26867              :             else
   26868            7 :               src = gen_lowpart (srcmode, SUBREG_REG (src));
   26869              :           }
   26870              : 
   26871            7 :         if (GET_MODE (dst) == dstmode)
   26872              :           d = dst;
   26873              :         else
   26874            7 :           d = gen_reg_rtx (dstmode);
   26875              : 
   26876            7 :         emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
   26877            7 :                           gen_lowpart (srcmode, src),
   26878            7 :                           GEN_INT (1 << (pos / size))));
   26879            7 :         if (d != dst)
   26880            7 :           emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
   26881              :         return true;
   26882              :       }
   26883              : 
   26884              :     default:
   26885              :       return false;
   26886              :     }
   26887              : }
   26888              : 
   26889              : /* All CPUs prefer to avoid cross-lane operations so perform reductions
   26890              :    upper against lower halves up to SSE reg size.  */
   26891              : 
   26892              : machine_mode
   26893         1992 : ix86_split_reduction (machine_mode mode)
   26894              : {
   26895              :   /* Reduce lowpart against highpart until we reach SSE reg width to
   26896              :      avoid cross-lane operations.  */
   26897         1992 :   switch (mode)
   26898              :     {
   26899              :     case E_V8DImode:
   26900              :     case E_V4DImode:
   26901              :       return V2DImode;
   26902            9 :     case E_V16SImode:
   26903            9 :     case E_V8SImode:
   26904            9 :       return V4SImode;
   26905            8 :     case E_V32HImode:
   26906            8 :     case E_V16HImode:
   26907            8 :       return V8HImode;
   26908            4 :     case E_V64QImode:
   26909            4 :     case E_V32QImode:
   26910            4 :       return V16QImode;
   26911            5 :     case E_V16SFmode:
   26912            5 :     case E_V8SFmode:
   26913            5 :       return V4SFmode;
   26914           16 :     case E_V8DFmode:
   26915           16 :     case E_V4DFmode:
   26916           16 :       return V2DFmode;
   26917         1945 :     default:
   26918         1945 :       return mode;
   26919              :     }
   26920              : }
   26921              : 
   26922              : /* Generate call to __divmoddi4.  */
   26923              : 
   26924              : void
   26925          896 : ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
   26926              :                             rtx op0, rtx op1,
   26927              :                             rtx *quot_p, rtx *rem_p)
   26928              : {
   26929         1792 :   rtx rem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   26930              : 
   26931          896 :   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
   26932              :                                       mode, op0, mode, op1, mode,
   26933          896 :                                       XEXP (rem, 0), Pmode);
   26934          896 :   *quot_p = quot;
   26935          896 :   *rem_p = rem;
   26936          896 : }
   26937              : 
   26938              : void
   26939           64 : ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
   26940              :                                   enum rtx_code code, bool after,
   26941              :                                   bool doubleword)
   26942              : {
   26943           64 :   rtx old_reg, new_reg, old_mem, success;
   26944           64 :   machine_mode mode = GET_MODE (target);
   26945           64 :   rtx_code_label *loop_label = NULL;
   26946              : 
   26947           64 :   old_reg = gen_reg_rtx (mode);
   26948           64 :   new_reg = old_reg;
   26949           64 :   old_mem = copy_to_reg (mem);
   26950           64 :   loop_label = gen_label_rtx ();
   26951           64 :   emit_label (loop_label);
   26952           64 :   emit_move_insn (old_reg, old_mem);
   26953              : 
   26954              :   /* return value for atomic_fetch_op.  */
   26955           64 :   if (!after)
   26956           32 :     emit_move_insn (target, old_reg);
   26957              : 
   26958           64 :   if (code == NOT)
   26959              :     {
   26960           16 :       new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
   26961              :                                      true, OPTAB_LIB_WIDEN);
   26962           16 :       new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
   26963              :     }
   26964              :   else
   26965           48 :     new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
   26966              :                                    true, OPTAB_LIB_WIDEN);
   26967              : 
   26968              :   /* return value for atomic_op_fetch.  */
   26969           64 :   if (after)
   26970           32 :     emit_move_insn (target, new_reg);
   26971              : 
   26972           64 :   success = NULL_RTX;
   26973              : 
   26974           64 :   ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
   26975              :                             gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
   26976              :                                           SImode),
   26977              :                             doubleword, loop_label);
   26978           64 : }
   26979              : 
   26980              : /* Relax cmpxchg instruction.  Param LOOP_LABEL indicates whether
   26981              :    the instruction should be relaxed with a pause loop.  If not,
   26982              :    it will be relaxed to an atomic load + compare, and skip the
   26983              :    cmpxchg instruction if mem != exp_input.  */
   26984              : 
   26985              : void
   26986           72 : ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
   26987              :                           rtx mem, rtx exp_input, rtx new_input,
   26988              :                           rtx mem_model, bool doubleword,
   26989              :                           rtx_code_label *loop_label)
   26990              : {
   26991           72 :   rtx_code_label *cmp_label = NULL;
   26992           72 :   rtx_code_label *done_label = NULL;
   26993           72 :   rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
   26994           72 :   rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
   26995           72 :   rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
   26996           72 :   machine_mode mode = GET_MODE (target_val), hmode = mode;
   26997              : 
   26998           72 :   if (*ptarget_bool == NULL)
   26999           64 :     target_bool = gen_reg_rtx (QImode);
   27000              :   else
   27001              :     target_bool = *ptarget_bool;
   27002              : 
   27003           72 :   cmp_label = gen_label_rtx ();
   27004           72 :   done_label = gen_label_rtx ();
   27005              : 
   27006           72 :   new_mem = gen_reg_rtx (mode);
   27007              :   /* Load memory first.  */
   27008           72 :   expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
   27009              : 
   27010           72 :   switch (mode)
   27011              :     {
   27012              :     case E_TImode:
   27013              :       gendw = gen_atomic_compare_and_swapti_doubleword;
   27014              :       hmode = DImode;
   27015              :       break;
   27016           18 :     case E_DImode:
   27017           18 :       if (doubleword)
   27018              :         {
   27019              :           gendw = gen_atomic_compare_and_swapdi_doubleword;
   27020              :           hmode = SImode;
   27021              :         }
   27022              :       else
   27023              :         gen = gen_atomic_compare_and_swapdi_1;
   27024              :       break;
   27025           18 :     case E_SImode:
   27026           18 :       gen = gen_atomic_compare_and_swapsi_1;
   27027           18 :       break;
   27028           18 :     case E_HImode:
   27029           18 :       gen = gen_atomic_compare_and_swaphi_1;
   27030           18 :       break;
   27031           18 :     case E_QImode:
   27032           18 :       gen = gen_atomic_compare_and_swapqi_1;
   27033           18 :       break;
   27034            0 :     default:
   27035            0 :       gcc_unreachable ();
   27036              :     }
   27037              : 
   27038              :   /* Compare mem value with expected value.  */
   27039           54 :   if (doubleword)
   27040              :     {
   27041            0 :       rtx low_new_mem = gen_lowpart (hmode, new_mem);
   27042            0 :       rtx low_exp_input = gen_lowpart (hmode, exp_input);
   27043            0 :       rtx high_new_mem = gen_highpart (hmode, new_mem);
   27044            0 :       rtx high_exp_input = gen_highpart (hmode, exp_input);
   27045            0 :       emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
   27046              :                                hmode, 1, cmp_label,
   27047              :                                profile_probability::guessed_never ());
   27048            0 :       emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
   27049              :                                hmode, 1, cmp_label,
   27050              :                                profile_probability::guessed_never ());
   27051              :     }
   27052              :   else
   27053           72 :     emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
   27054           72 :                              GET_MODE (exp_input), 1, cmp_label,
   27055              :                              profile_probability::guessed_never ());
   27056              : 
   27057              :   /* Directly emits cmpxchg here.  */
   27058           72 :   if (doubleword)
   27059            0 :     emit_insn (gendw (target_val, mem, exp_input,
   27060            0 :                       gen_lowpart (hmode, new_input),
   27061              :                       gen_highpart (hmode, new_input),
   27062              :                       mem_model));
   27063              :   else
   27064           72 :     emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
   27065              : 
   27066           72 :   if (!loop_label)
   27067              :   {
   27068            8 :     emit_jump_insn (gen_jump (done_label));
   27069            8 :     emit_barrier ();
   27070            8 :     emit_label (cmp_label);
   27071            8 :     emit_move_insn (target_val, new_mem);
   27072            8 :     emit_label (done_label);
   27073            8 :     ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
   27074              :                        const0_rtx);
   27075              :   }
   27076              :   else
   27077              :   {
   27078           64 :     ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
   27079              :                        const0_rtx);
   27080           64 :     emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
   27081           64 :                              GET_MODE (target_bool), 1, loop_label,
   27082              :                              profile_probability::guessed_never ());
   27083           64 :     emit_jump_insn (gen_jump (done_label));
   27084           64 :     emit_barrier ();
   27085              : 
   27086              :     /* If mem is not expected, pause and loop back.  */
   27087           64 :     emit_label (cmp_label);
   27088           64 :     emit_move_insn (target_val, new_mem);
   27089           64 :     emit_insn (gen_pause ());
   27090           64 :     emit_jump_insn (gen_jump (loop_label));
   27091           64 :     emit_barrier ();
   27092           64 :     emit_label (done_label);
   27093              :   }
   27094              : 
   27095           72 :   *ptarget_bool = target_bool;
   27096           72 : }
   27097              : 
   27098              : /* Convert a BFmode VAL to SFmode without signaling sNaNs.
   27099              :    This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16.  */
   27100              : 
   27101              : rtx
   27102         2832 : ix86_expand_fast_convert_bf_to_sf (rtx val)
   27103              : {
   27104         2832 :   rtx op = gen_lowpart (HImode, val), ret;
   27105         2832 :   if (CONST_INT_P (op))
   27106              :     {
   27107          514 :       ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
   27108              :                                             val, BFmode);
   27109          514 :       if (ret)
   27110              :         return ret;
   27111              :       /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
   27112            1 :       ret = gen_reg_rtx (SImode);
   27113            1 :       emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
   27114            1 :       emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
   27115            1 :       return gen_lowpart (SFmode, ret);
   27116              :     }
   27117              : 
   27118         2318 :   ret = gen_reg_rtx (SFmode);
   27119         2318 :   emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
   27120         2318 :   return ret;
   27121              : }
   27122              : 
   27123              : rtx
   27124        65576 : ix86_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
   27125              :                         rtx_code code, tree treeop0, tree treeop1)
   27126              : {
   27127        65576 :   if (!TARGET_APX_CCMP)
   27128              :     return NULL_RTX;
   27129              : 
   27130        65576 :   rtx op0, op1, res;
   27131        65576 :   machine_mode op_mode;
   27132              : 
   27133        65576 :   start_sequence ();
   27134        65576 :   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
   27135              : 
   27136        65576 :   op_mode = GET_MODE (op0);
   27137        65576 :   if (op_mode == VOIDmode)
   27138            0 :     op_mode = GET_MODE (op1);
   27139              : 
   27140              :   /* We only support the following scalar comparisons that use just 1
   27141              :      instruction: DI/SI/QI/HI/DF/SF/HF.
   27142              :      Unordered/Ordered compare cannot be correctly identified by
   27143              :      ccmp so they are not supported.  */
   27144        98348 :   if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
   27145        65576 :         || op_mode == QImode || op_mode == DFmode || op_mode == SFmode
   27146        32772 :         || op_mode == HFmode)
   27147        32806 :       || code == ORDERED
   27148        32806 :       || code == UNORDERED)
   27149              :     {
   27150        32770 :       end_sequence ();
   27151        32770 :       return NULL_RTX;
   27152              :     }
   27153              : 
   27154              :   /* Canonicalize the operands according to mode.  */
   27155        32806 :   if (SCALAR_INT_MODE_P (op_mode))
   27156              :     {
   27157        32799 :       if (!nonimmediate_operand (op0, op_mode))
   27158            0 :         op0 = force_reg (op_mode, op0);
   27159        32799 :       if (!x86_64_general_operand (op1, op_mode))
   27160            0 :         op1 = force_reg (op_mode, op1);
   27161              :     }
   27162              :   else
   27163              :     {
   27164              :       /* op0/op1 can be canonicalized from expand_fp_compare, so
   27165              :          just adjust the code to make it generate a supported fp
   27166              :          condition.  */
   27167            7 :       if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
   27168              :         {
   27169              :           /* First try to split the condition if we don't need to honor
   27170              :              NaNs, as the ORDERED/UNORDERED check always falls
   27171              :              through.  */
   27172            6 :           if (!HONOR_NANS (op_mode))
   27173              :             {
   27174            6 :               rtx_code first_code;
   27175            6 :               split_comparison (code, op_mode, &first_code, &code);
   27176              :             }
   27177              :           /* Otherwise try to swap the operand order and check if
   27178              :              the comparison is supported.  */
   27179              :           else
   27180              :             {
   27181            0 :               code = swap_condition (code);
   27182            0 :               std::swap (op0, op1);
   27183              :             }
   27184              : 
   27185            6 :           if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
   27186              :             {
   27187            0 :               end_sequence ();
   27188            0 :               return NULL_RTX;
   27189              :             }
   27190              :         }
   27191              :     }
   27192              : 
   27193        32806 :   *prep_seq = end_sequence ();
   27194              : 
   27195        32806 :   start_sequence ();
   27196              : 
   27197        32806 :   res = ix86_expand_compare (code, op0, op1);
   27198              : 
   27199        32806 :   if (!res)
   27200              :     {
   27201              :       end_sequence ();
   27202              :       return NULL_RTX;
   27203              :     }
   27204        32806 :   *gen_seq = end_sequence ();
   27205              : 
   27206        32806 :   return res;
   27207              : }
   27208              : /* Chain a conditional compare of TREEOP0/TREEOP1 onto PREV: operand setup is emitted after *PREP_SEQ and the ccmp insn after *GEN_SEQ.  Return the CMP_CODE test of FLAGS_REG against zero, or NULL_RTX on failure.  */
   27209              : rtx
   27210        32809 : ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
   27211              :                        rtx_code cmp_code, tree treeop0, tree treeop1,
   27212              :                        rtx_code bit_code)
   27213              : {
   27214        32809 :   if (!TARGET_APX_CCMP)
   27215              :     return NULL_RTX;
   27216              : 
   27217        32809 :   rtx op0, op1, target;
   27218        32809 :   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
   27219        32809 :   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
   27220        32809 :   insn_code icode;
   27221        32809 :   rtx_code prev_code;
   27222        32809 :   struct expand_operand ops[5];
   27223        32809 :   int dfv;
   27224              : 
   27225              :   /* Exit early for non-integer modes to avoid the O(n^2) part of expand_operands.  Only DI/SI/HI/QImode operands are handled.  */
   27226        32809 :   cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0));
   27227              : 
   27228        32809 :   if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
   27229              :         || op_mode == QImode))
   27230              :     return NULL_RTX;
   27231              : 
   27232           32 :   push_to_sequence (*prep_seq);
   27233           32 :   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
   27234              :   /* Fetch the ccmp insn code for OP_MODE and prepare op0/op1 as its operands 2 and 3.  */
   27235           32 :   icode = code_for_ccmp (op_mode);
   27236              : 
   27237           32 :   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
   27238           32 :   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
   27239           32 :   if (!op0 || !op1)
   27240              :     {
   27241            0 :       end_sequence ();
   27242            0 :       return NULL_RTX;
   27243              :     }
   27244              : 
   27245           32 :   *prep_seq = end_sequence ();
   27246              :   /* Operand 0 of the ccmp is FLAGS_REG in CCmode; DFV is computed from CMP_CODE.  */
   27247           32 :   target = gen_rtx_REG (cc_mode, FLAGS_REG);
   27248           32 :   dfv = ix86_get_flags_cc ((rtx_code) cmp_code);
   27249              : 
   27250           32 :   prev_code = GET_CODE (prev);
   27251              :   /* Fix up the FP compare code: a CCFPmode previous compare is mapped to its integer condition.  */
   27252           32 :   if (GET_MODE (XEXP (prev, 0)) == CCFPmode)
   27253            7 :     prev_code = ix86_fp_compare_code_to_integer (prev_code);
   27254              :   /* For anything but AND the previous condition is reversed; for AND the low bit of DFV is flipped instead.  */
   27255           32 :   if (bit_code != AND)
   27256           17 :     prev_code = reverse_condition (prev_code);
   27257              :   else
   27258           15 :     dfv = (int)(dfv ^ 1);
   27259              :   /* Rebuild PREV as PREV_CODE applied to the original flags operand and zero.  */
   27260           32 :   prev = gen_rtx_fmt_ee (prev_code, VOIDmode, XEXP (prev, 0),
   27261              :                          const0_rtx);
   27262              : 
   27263           32 :   create_fixed_operand (&ops[0], target);
   27264           32 :   create_fixed_operand (&ops[1], prev);
   27265           32 :   create_fixed_operand (&ops[2], op0);
   27266           32 :   create_fixed_operand (&ops[3], op1);
   27267           32 :   create_fixed_operand (&ops[4], GEN_INT (dfv));
   27268              :   /* Emit the ccmp insn after the insns already in *GEN_SEQ; give up if it cannot be expanded.  */
   27269           32 :   push_to_sequence (*gen_seq);
   27270           32 :   if (!maybe_expand_insn (icode, 5, ops))
   27271              :     {
   27272            0 :       end_sequence ();
   27273            0 :       return NULL_RTX;
   27274              :     }
   27275              : 
   27276           32 :   *gen_seq = end_sequence ();
   27277              : 
   27278           32 :   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
   27279              : }
   27280              : 
   27281              : /* Attempt to convert a CONST_VECTOR into a bcst_mem_operand.
   27282              :    Returns NULL_RTX if X cannot be expressed as a suitable
   27283              :    VEC_DUPLICATE in mode MODE.  */
   27284              : 
   27285              : static rtx
   27286           48 : ix86_gen_bcst_mem (machine_mode mode, rtx x)
   27287              : {
   27288           48 :   if (!TARGET_AVX512F
   27289           48 :       || !CONST_VECTOR_P (x)
   27290           64 :       || (!TARGET_AVX512VL && GET_MODE_SIZE (mode) != 64)
   27291          147 :       || !VALID_BCST_MODE_P (GET_MODE_INNER (mode))
   27292              :          /* Disallow HFmode broadcast.  */
   27293          126 :       || GET_MODE_SIZE (GET_MODE_INNER (mode)) < 4)
   27294              :     return NULL_RTX;
   27295              :   /* Only CONST_SCALAR_INT, CONST_DOUBLE or CONST_FIXED elements are handled.  */
   27296           21 :   rtx cst = CONST_VECTOR_ELT (x, 0);
   27297           21 :   if (!CONST_SCALAR_INT_P (cst)
   27298           15 :       && !CONST_DOUBLE_P (cst)
   27299            0 :       && !CONST_FIXED_P (cst))
   27300              :     return NULL_RTX;
   27301              :   /* The constant must have exactly as many elements as MODE.  */
   27302           21 :   int n_elts = GET_MODE_NUNITS (mode);
   27303           42 :   if (CONST_VECTOR_NUNITS (x) != n_elts)
   27304              :     return NULL_RTX;
   27305              :   /* Every remaining element must equal the first one.  */
   27306          150 :   for (int i = 1; i < n_elts; i++)
   27307          129 :     if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
   27308              :       return NULL_RTX;
   27309              :   /* Put the scalar in the constant pool and wrap the MEM in a VEC_DUPLICATE.  */
   27310           42 :   rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
   27311           21 :   return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
   27312              : }
   27313              : 
   27314              : /* Determine the ternlog immediate index that implements 3-operand
   27315              :    ternary logic expression OP.  This uses and modifies the 3 element
   27316              :    array ARGS to record and check the leaves, either 3 REGs, or 2 REGs
   27317              :    and a MEM.  Returns an index between 0 and 255 for a valid ternlog,
   27318              :    or -1 if the expression isn't suitable.  */
   27319              : 
   27320              : int
   27321      7280332 : ix86_ternlog_idx (rtx op, rtx *args)
   27322              : {
   27323      7280332 :   int idx0, idx1;
   27324              : 
   27325      7280332 :   if (!op)
   27326              :     return -1;
   27327              : 
   27328      7280332 :   switch (GET_CODE (op))
   27329              :     {
   27330       751232 :     case SUBREG:
   27331       751232 :       if (!register_operand (op, GET_MODE (op)))
   27332              :         return -1;
   27333              :       /* FALLTHRU */
   27334              :     /* Register leaves take the first free slot of ARGS[0], ARGS[1], ARGS[2]; the slots yield index 0xf0, 0xcc and 0xaa respectively.  */
   27335      3565035 :     case REG:
   27336      3565035 :       if (!args[0])
   27337              :         {
   27338      1848599 :           args[0] = op;
   27339      1848599 :           return 0xf0;
   27340              :         }
   27341      1716436 :       if (rtx_equal_p (op, args[0]))
   27342              :         return 0xf0;
   27343      1689839 :       if (!args[1])
   27344              :         {
   27345      1425231 :           args[1] = op;
   27346      1425231 :           return 0xcc;
   27347              :         }
   27348       264608 :       if (rtx_equal_p (op, args[1]))
   27349              :         return 0xcc;
   27350       248036 :       if (!args[2])
   27351              :         {
   27352       225700 :           args[2] = op;
   27353       225700 :           return 0xaa;
   27354              :         }
   27355        22336 :       if (rtx_equal_p (op, args[2]))
   27356              :         return 0xaa;
   27357              :       return -1;
   27358              : 
   27359        17708 :     case VEC_DUPLICATE:
   27360        17708 :       if (!bcst_mem_operand (op, GET_MODE (op)))
   27361              :         return -1;
   27362          302 :       goto do_mem_operand;
   27363              : 
   27364       365347 :     case MEM:
   27365       365347 :       if (!memory_operand (op, GET_MODE (op)))
   27366              :         return -1;
   27367       365182 :       if (MEM_P (op)
   27368       365182 :           && MEM_VOLATILE_P (op)
   27369       365276 :           && !volatile_ok)
   27370              :         return -1;
   27371              :       /* FALLTHRU */
   27372              :     /* Memory, broadcast and constant-vector leaves try ARGS[2] first, then ARGS[0], then ARGS[1].  */
   27373       473669 :     case CONST_VECTOR:
   27374       473669 : do_mem_operand:
   27375       473669 :       if (!args[2])
   27376              :         {
   27377       426415 :           args[2] = op;
   27378       426415 :           return 0xaa;
   27379              :         }
   27380              :       /* Maximum of one volatile memory reference per expression.  */
   27381        47254 :       if (side_effects_p (op))
   27382              :         return -1;
   27383        47254 :       if (rtx_equal_p (op, args[2]))
   27384              :         return 0xaa;
   27385              :       /* Check if CONST_VECTOR is the ones-complement of args[2].  */
   27386        47203 :       if (CONST_VECTOR_P (op)
   27387         3446 :           && CONST_VECTOR_P (args[2])
   27388        47448 :           && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
   27389          245 :                                                           op, GET_MODE (op)),
   27390              :                           args[2]))
   27391              :         return 0x55;
   27392        47016 :       if (!args[0])
   27393              :         {
   27394        45218 :           args[0] = op;
   27395        45218 :           return 0xf0;
   27396              :         }
   27397         1798 :       if (rtx_equal_p (op, args[0]))
   27398              :         return 0xf0;
   27399              :       /* Check if CONST_VECTOR is the ones-complement of args[0].  */
   27400         1798 :       if (CONST_VECTOR_P (op)
   27401          101 :           && CONST_VECTOR_P (args[0])
   27402         1840 :           && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
   27403           42 :                                                           op, GET_MODE (op)),
   27404              :                           args[0]))
   27405              :         return 0x0f;
   27406         1756 :       if (!args[1])
   27407              :         {
   27408         1744 :           args[1] = op;
   27409         1744 :           return 0xcc;
   27410              :         }
   27411           12 :       if (rtx_equal_p (op, args[1]))
   27412              :         return 0xcc;
   27413              :       /* Check if CONST_VECTOR is the ones-complement of args[1].  */
   27414           12 :       if (CONST_VECTOR_P (op)
   27415            0 :           && CONST_VECTOR_P (args[1])
   27416           12 :           && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
   27417            0 :                                                           op, GET_MODE (op)),
   27418              :                           args[1]))
   27419              :         return 0x33;
   27420              :       return -1;
   27421              :     /* NOT complements the 8-bit index of its operand.  */
   27422       185167 :     case NOT:
   27423       185167 :       idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
   27424       185167 :       return (idx0 >= 0) ? idx0 ^ 0xff : -1;
   27425              :     /* AND, IOR and XOR combine the indexes of both operands with the same bitwise operation.  */
   27426      1301572 :     case AND:
   27427      1301572 :       idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
   27428      1301572 :       if (idx0 < 0)
   27429              :         return -1;
   27430      1071839 :       idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
   27431      1071839 :       return (idx1 >= 0) ? idx0 & idx1 : -1;
   27432              : 
   27433       953110 :     case IOR:
   27434       953110 :       idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
   27435       953110 :       if (idx0 < 0)
   27436              :         return -1;
   27437       708705 :       idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
   27438       708705 :       return (idx1 >= 0) ? idx0 | idx1 : -1;
   27439              :     /* XOR with an all-ones vector is handled as a complement of the first operand.  */
   27440       402407 :     case XOR:
   27441       402407 :       idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
   27442       402407 :       if (idx0 < 0)
   27443              :         return -1;
   27444       383203 :       if (vector_all_ones_operand (XEXP (op, 1), GET_MODE (op)))
   27445         6671 :         return idx0 ^ 0xff;
   27446       376532 :       idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
   27447       376532 :       return (idx1 >= 0) ? idx0 ^ idx1 : -1;
   27448              :     /* An existing UNSPEC_VTERNLOG contributes its own constant immediate, provided its three operands map to 0xf0, 0xcc and 0xaa in order.  */
   27449         7198 :     case UNSPEC:
   27450         7198 :       if (XINT (op, 1) != UNSPEC_VTERNLOG
   27451            0 :           || XVECLEN (op, 0) != 4
   27452            0 :           || !CONST_INT_P (XVECEXP (op, 0, 3)))
   27453              :         return -1;
   27454              : 
   27455              :       /* TODO: Handle permuted operands.  */
   27456            0 :       if (ix86_ternlog_idx (XVECEXP (op, 0, 0), args) != 0xf0
   27457            0 :           || ix86_ternlog_idx (XVECEXP (op, 0, 1), args) != 0xcc
   27458            0 :           || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
   27459            0 :         return -1;
   27460            0 :       return INTVAL (XVECEXP (op, 0, 3));
   27461              : 
   27462              :     default:
   27463              :       return -1;
   27464              :     }
   27465              : }
   27466              : 
   27467              : /* Return TRUE if OP (in mode MODE) is the leaf of a ternary logic
   27468              :    expression: a register operand, a MEM, a CONST_VECTOR or a broadcast memory operand.  */
   27469              : 
   27470              : bool
   27471      3377536 : ix86_ternlog_leaf_p (rtx op, machine_mode mode)
   27472              : {
   27473              :   /* We can't use memory_operand here, as it may return a different
   27474              :      value before and after reload (for volatile MEMs) which creates
   27475              :      problems splitting instructions.  */
   27476      3377536 :   return register_operand (op, mode)
   27477       735344 :          || MEM_P (op)
   27478       384059 :          || CONST_VECTOR_P (op)
   27479      3659351 :          || bcst_mem_operand (op, mode);
   27480              : }
   27481              : 
   27482              : /* Test whether OP is a 3-operand ternary logic expression suitable
   27483              :    for use in a ternlog instruction.  */
   27484              : 
   27485              : bool
   27486      2245014 : ix86_ternlog_operand_p (rtx op)
   27487              : {
   27488      2245014 :   rtx op0, op1;
   27489      2245014 :   rtx args[3];
   27490              :   /* Start with no leaves recorded; reject OP if no ternlog index can be computed for it.  */
   27491      2245014 :   args[0] = NULL_RTX;
   27492      2245014 :   args[1] = NULL_RTX;
   27493      2245014 :   args[2] = NULL_RTX;
   27494      2245014 :   int idx = ix86_ternlog_idx (op, args);
   27495      2245014 :   if (idx < 0)
   27496              :     return false;
   27497              : 
   27498              :   /* Don't match simple (binary or unary) expressions.  */
   27499      1824755 :   machine_mode mode = GET_MODE (op);
   27500      1824755 :   switch (GET_CODE (op))
   27501              :     {
   27502       843125 :     case AND:
   27503       843125 :       op0 = XEXP (op, 0);
   27504       843125 :       op1 = XEXP (op, 1);
   27505              : 
   27506              :       /* Prefer pand.  */
   27507       843125 :       if (ix86_ternlog_leaf_p (op0, mode)
   27508       843125 :           && ix86_ternlog_leaf_p (op1, mode))
   27509              :         return false;
   27510              :       /* Prefer pandn.  */
   27511       109040 :       if (GET_CODE (op0) == NOT
   27512        77461 :           && register_operand (XEXP (op0, 0), mode)
   27513       182908 :           && ix86_ternlog_leaf_p (op1, mode))
   27514              :         return false;
   27515              :       break;
   27516              : 
   27517       622274 :     case IOR:
   27518              :       /* Prefer por.  */
   27519       622274 :       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
   27520       622274 :           && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
   27521              :         return false;
   27522              :       break;
   27523              : 
   27524       326490 :     case XOR:
   27525       326490 :       op1 = XEXP (op, 1);  /* NOTE(review): op1 is not read after this assignment.  */
   27526              :       /* Prefer pxor, or one_cmpl<vmode>2.  */
   27527       326490 :       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
   27528       326490 :           && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
   27529              :         return false;
   27530              :       break;
   27531              : 
   27532              :     default:
   27533              :       break;
   27534              :     }
   27535              :   return true;
   27536              : }
   27537              : 
   27538              : /* Helper function for ix86_expand_ternlog.  Expand TARGET = OP0 CODE OP1 in MODE through ix86_expand_vector_logical_operator and return TARGET.  */
   27539              : static rtx
   27540            0 : ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode,
   27541              :                            rtx op0, rtx op1, rtx target)
   27542              : {
   27543            0 :   if (GET_MODE (op0) != mode)
   27544            0 :     op0 = gen_lowpart (mode, op0);
   27545            0 :   if (GET_MODE (op1) != mode)
   27546            0 :     op1 = gen_lowpart (mode, op1);
   27547              :   /* Constant vectors are placed in the constant pool and used as memory operands.  */
   27548            0 :   if (CONST_VECTOR_P (op0))
   27549            0 :     op0 = validize_mem (force_const_mem (mode, op0));
   27550            0 :   if (CONST_VECTOR_P (op1))
   27551            0 :     op1 = validize_mem (force_const_mem (mode, op1));
   27552              :   /* Make op0 a register: copy it to a fresh pseudo if neither operand is one, otherwise swap the operands.  */
   27553            0 :   if (!register_operand (op0, mode))
   27554              :     {
   27555            0 :       if (!register_operand (op1, mode))
   27556              :         {
   27557              :           /* We can't use force_reg (op0, mode).  */
   27558            0 :           rtx reg = gen_reg_rtx (mode);
   27559            0 :           emit_move_insn (reg, op0);
   27560            0 :           op0 = reg;
   27561              :         }
   27562              :       else
   27563              :         std::swap (op0, op1);
   27564              :     }
   27565            0 :   rtx ops[3] = { target, op0, op1 };
   27566            0 :   ix86_expand_vector_logical_operator (code, mode, ops);
   27567            0 :   return target;
   27568              : }
   27569              : 
   27570              : 
   27571              : /* Helper function for ix86_expand_ternlog.  Emit TARGET = ~OP0 & OP1 in MODE and return TARGET.  */
   27572              : static rtx
   27573            0 : ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target)
   27574              : {
   27575            0 :   if (GET_MODE (op0) != mode)
   27576            0 :     op0 = gen_lowpart (mode, op0);
   27577            0 :   op0 = gen_rtx_NOT (mode, op0);
   27578            0 :   if (GET_MODE (op1) != mode)
   27579            0 :     op1 = gen_lowpart (mode, op1);
   27580            0 :   if (CONST_VECTOR_P (op1))
   27581            0 :     op1 = validize_mem (force_const_mem (mode, op1));
   27582            0 :   emit_move_insn (target, gen_rtx_AND (mode, op0, op1));
   27583            0 :   return target;
   27584              : }
   27585              : 
   27586              : /* Expand a 3-operand ternary logic expression.  Return TARGET. */
   27587              : rtx
   27588         2420 : ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
   27589              :                      rtx target)
   27590              : {
   27591         2420 :   rtx tmp0, tmp1, tmp2;
   27592              : 
   27593         2420 :   if (!target)
   27594            3 :     target = gen_reg_rtx (mode);
   27595              : 
   27596              :   /* Canonicalize ternlog index for degenerate (duplicated) operands.  */
   27597         2420 :   if (rtx_equal_p (op0, op1) && rtx_equal_p (op0, op2))
   27598            0 :     switch (idx & 0x81)
   27599              :       {
   27600              :       case 0x00:
   27601              :         idx = 0x00;
   27602              :         break;
   27603              :       case 0x01:
   27604              :         idx = 0x0f;
   27605              :         break;
   27606              :       case 0x80:
   27607              :         idx = 0xf0;
   27608              :         break;
   27609              :       case 0x81:
   27610              :         idx = 0xff;
   27611              :         break;
   27612              :       }
   27613              : 
   27614         2420 :   switch (idx & 0xff)
   27615              :     {
   27616            0 :     case 0x00:
   27617            0 :       if ((!op0 || !side_effects_p (op0))
   27618            0 :           && (!op1 || !side_effects_p (op1))
   27619            0 :           && (!op2 || !side_effects_p (op2)))
   27620              :         {
   27621            0 :           emit_move_insn (target, CONST0_RTX (mode));
   27622            0 :           return target;
   27623              :         }
   27624              :       break;
   27625              : 
   27626            0 :     case 0x0a: /* ~a&c */
   27627            0 :       if ((!op1 || !side_effects_p (op1))
   27628            0 :           && op0 && register_operand (op0, mode)
   27629            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27630            0 :         return ix86_expand_ternlog_andnot (mode, op0, op2, target);
   27631              :       break;
   27632              : 
   27633            0 :     case 0x0c: /* ~a&b */
   27634            0 :       if ((!op2 || !side_effects_p (op2))
   27635            0 :           && op0 && register_operand (op0, mode)
   27636            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode))
   27637            0 :         return ix86_expand_ternlog_andnot (mode, op0, op1, target);
   27638              :       break;
   27639              : 
   27640           78 :     case 0x0f:  /* ~a */
   27641            0 :       if ((!op1 || !side_effects_p (op1))
   27642           78 :           && (!op2 || !side_effects_p (op2))
   27643          156 :           && op0)
   27644              :         {
   27645           78 :           emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
   27646           78 :           return target;
   27647              :         }
   27648              :       break;
   27649              : 
   27650            0 :     case 0x22: /* ~b&c */
   27651            0 :       if ((!op0 || !side_effects_p (op0))
   27652            0 :           && op1 && register_operand (op1, mode)
   27653            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27654            0 :         return ix86_expand_ternlog_andnot (mode, op1, op2, target);
   27655              :       break;
   27656              : 
   27657            0 :     case 0x30: /* ~b&a */
   27658            0 :       if ((!op2 || !side_effects_p (op2))
   27659            0 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27660            0 :           && op1 && register_operand (op1, mode))
   27661            0 :         return ix86_expand_ternlog_andnot (mode, op1, op0, target);
   27662              :       break;
   27663              : 
   27664            0 :     case 0x33:  /* ~b */
   27665            0 :       if ((!op0 || !side_effects_p (op0))
   27666            0 :           && (!op2 || !side_effects_p (op2))
   27667            0 :           && op1)
   27668              :         {
   27669            0 :           emit_move_insn (target, gen_rtx_XOR (mode, op1, CONSTM1_RTX (mode)));
   27670            0 :           return target;
   27671              :         }
   27672              :       break;
   27673              : 
   27674            0 :     case 0x3c:  /* a^b */
   27675            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27676            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27677            0 :           && (!op2 || !side_effects_p (op2)))
   27678            0 :         return ix86_expand_ternlog_binop (XOR, mode, op0, op1, target);
   27679              :       break;
   27680              : 
   27681            0 :     case 0x44: /* ~c&b */
   27682            0 :       if ((!op0 || !side_effects_p (op0))
   27683            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27684            0 :           && op2 && register_operand (op2, mode))
   27685            0 :         return ix86_expand_ternlog_andnot (mode, op2, op1, target);
   27686              :       break;
   27687              : 
   27688            2 :     case 0x50: /* ~c&a */
   27689            0 :       if ((!op1 || !side_effects_p (op1))
   27690            2 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27691            4 :           && op2 && register_operand (op2, mode))
   27692            0 :         return ix86_expand_ternlog_andnot (mode, op2, op0, target);
   27693              :       break;
   27694              : 
   27695            4 :     case 0x55:  /* ~c */
   27696            1 :       if ((!op0 || !side_effects_p (op0))
   27697            4 :           && (!op1 || !side_effects_p (op1))
   27698            8 :           && op2)
   27699              :         {
   27700            4 :           emit_move_insn (target, gen_rtx_XOR (mode, op2, CONSTM1_RTX (mode)));
   27701            4 :           return target;
   27702              :         }
   27703              :       break;
   27704              : 
   27705            0 :     case 0x5a:  /* a^c */
   27706            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27707            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode)
   27708            0 :           && (!op1 || !side_effects_p (op1)))
   27709            0 :         return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
   27710              :       break;
   27711              : 
   27712            0 :     case 0x66:  /* b^c */
   27713            0 :       if ((!op0 || !side_effects_p (op0))
   27714            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27715            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27716            0 :         return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
   27717              :       break;
   27718              : 
   27719            0 :     case 0x88:  /* b&c */
   27720            0 :       if ((!op0 || !side_effects_p (op0))
   27721            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27722            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27723            0 :         return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
   27724              :       break;
   27725              : 
   27726            0 :     case 0xa0:  /* a&c */
   27727            0 :       if ((!op1 || !side_effects_p (op1))
   27728            0 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27729            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27730            0 :         return ix86_expand_ternlog_binop (AND, mode, op0, op2, target);
   27731              :       break;
   27732              : 
   27733            0 :     case 0xaa:  /* c */
   27734            0 :       if ((!op0 || !side_effects_p (op0))
   27735            0 :           && (!op1 || !side_effects_p (op1))
   27736            0 :           && op2)
   27737              :         {
   27738            0 :           if (GET_MODE (op2) != mode)
   27739            0 :             op2 = gen_lowpart (mode, op2);
   27740            0 :           emit_move_insn (target, op2);
   27741            0 :           return target;
   27742              :         }
   27743              :       break;
   27744              : 
   27745            0 :     case 0xc0:  /* a&b */
   27746            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27747            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27748            0 :           && (!op2 || !side_effects_p (op2)))
   27749            0 :         return ix86_expand_ternlog_binop (AND, mode, op0, op1, target);
   27750              :       break;
   27751              : 
   27752            0 :     case 0xcc:  /* b */
   27753            0 :       if ((!op0 || !side_effects_p (op0))
   27754            0 :           && op1
   27755            0 :           && (!op2 || !side_effects_p (op2)))
   27756              :         {
   27757            0 :           if (GET_MODE (op1) != mode)
   27758            0 :             op1 = gen_lowpart (mode, op1);
   27759            0 :           emit_move_insn (target, op1);
   27760            0 :           return target;
   27761              :         }
   27762              :       break;
   27763              : 
   27764            0 :     case 0xee:  /* b|c */
   27765            0 :       if ((!op0 || !side_effects_p (op0))
   27766            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27767            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27768            0 :         return ix86_expand_ternlog_binop (IOR, mode, op1, op2, target);
   27769              :       break;
   27770              : 
   27771            6 :     case 0xf0:  /* a */
   27772            6 :       if (op0
   27773            6 :           && (!op1 || !side_effects_p (op1))
   27774           12 :           && (!op2 || !side_effects_p (op2)))
   27775              :         {
   27776            6 :           if (GET_MODE (op0) != mode)
   27777            0 :             op0 = gen_lowpart (mode, op0);
   27778            6 :           emit_move_insn (target, op0);
   27779            6 :           return target;
   27780              :         }
   27781              :       break;
   27782              : 
   27783            0 :     case 0xfa:  /* a|c */
   27784            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27785            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode)
   27786            0 :           && (!op1 || !side_effects_p (op1)))
   27787            0 :         return ix86_expand_ternlog_binop (IOR, mode, op0, op2, target);
   27788              :       break;
   27789              : 
   27790            0 :     case 0xfc:  /* a|b */
   27791            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27792            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27793            0 :           && (!op2 || !side_effects_p (op2)))
   27794            0 :         return ix86_expand_ternlog_binop (IOR, mode, op0, op1, target);
   27795              :       break;
   27796              : 
   27797            0 :     case 0xff:
   27798            0 :       if ((!op0 || !side_effects_p (op0))
   27799            0 :           && (!op1 || !side_effects_p (op1))
   27800            0 :           && (!op2 || !side_effects_p (op2)))
   27801              :         {
   27802            0 :           emit_move_insn (target, CONSTM1_RTX (mode));
   27803            0 :           return target;
   27804              :         }
   27805              :       break;
   27806              :     }
   27807              : 
   27808         2332 :   if (!register_operand (op0, mode))
   27809              :     {
   27810              :       /* We can't use force_reg (mode, op0).  */
   27811           12 :       tmp0 = gen_reg_rtx (GET_MODE (op0));
   27812           12 :       emit_move_insn (tmp0,op0);
   27813              :     }
   27814              :   else
   27815              :     tmp0 = op0;
   27816         2332 :   if (GET_MODE (tmp0) != mode)
   27817            0 :     tmp0 = gen_lowpart (mode, tmp0);
   27818              : 
   27819         2332 :   if (!op1 || rtx_equal_p (op0, op1))
   27820            6 :     tmp1 = copy_rtx (tmp0);
   27821         2326 :   else if (!register_operand (op1, mode))
   27822              :     {
   27823              :       /* We can't use force_reg (mode, op1).  */
   27824           28 :       tmp1 = gen_reg_rtx (GET_MODE (op1));
   27825           28 :       emit_move_insn (tmp1, op1);
   27826              :     }
   27827              :   else
   27828              :     tmp1 = op1;
   27829         2332 :   if (GET_MODE (tmp1) != mode)
   27830            0 :     tmp1 = gen_lowpart (mode, tmp1);
   27831              : 
   27832         2332 :   if (!op2 || rtx_equal_p (op0, op2))
   27833           75 :     tmp2 = copy_rtx (tmp0);
   27834         2257 :   else if (rtx_equal_p (op1, op2))
   27835            0 :     tmp2 = copy_rtx (tmp1);
   27836         2257 :   else if (CONST_VECTOR_P (op2))
   27837              :     {
   27838           43 :       if (GET_MODE (op2) != mode)
   27839            0 :         op2 = gen_lowpart (mode, op2);
   27840           43 :       tmp2 = ix86_gen_bcst_mem (mode, op2);
   27841           43 :       if (!tmp2)
   27842              :         {
   27843           25 :           machine_mode bcst32_mode = mode;
   27844           25 :           machine_mode bcst64_mode = mode;
   27845           25 :           switch (mode)
   27846              :             {
   27847            1 :             case V1TImode:
   27848            1 :             case V4SImode:
   27849            1 :             case V4SFmode:
   27850            1 :             case V8HImode:
   27851            1 :             case V16QImode:
   27852            1 :               bcst32_mode = V4SImode;
   27853            1 :               bcst64_mode = V2DImode;
   27854            1 :               break;
   27855              : 
   27856            0 :             case V2TImode:
   27857            0 :             case V8SImode:
   27858            0 :             case V8SFmode:
   27859            0 :             case V16HImode:
   27860            0 :             case V32QImode:
   27861            0 :               bcst32_mode = V8SImode;
   27862            0 :               bcst64_mode = V4DImode;
   27863            0 :               break;
   27864              : 
   27865            3 :             case V4TImode:
   27866            3 :             case V16SImode:
   27867            3 :             case V16SFmode:
   27868            3 :             case V32HImode:
   27869            3 :             case V64QImode:
   27870            3 :               bcst32_mode = V16SImode;
   27871            3 :               bcst64_mode = V8DImode;
   27872            3 :               break;
   27873              : 
   27874              :             default:
   27875              :               break;
   27876              :             }
   27877              : 
   27878           25 :           if (bcst32_mode != mode)
   27879              :             {
   27880            4 :               tmp2 = gen_lowpart (bcst32_mode, op2);
   27881            4 :               if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
   27882              :                 {
   27883            3 :                   tmp2 = ix86_expand_ternlog (bcst32_mode,
   27884            3 :                                               gen_lowpart (bcst32_mode, tmp0),
   27885            3 :                                               gen_lowpart (bcst32_mode, tmp1),
   27886              :                                               tmp2, idx, NULL_RTX);
   27887            3 :                   emit_move_insn (target, gen_lowpart (mode, tmp2));
   27888            3 :                   return target;
   27889              :                 }
   27890              :             }
   27891              : 
   27892           22 :           if (bcst64_mode != mode)
   27893              :             {
   27894            1 :               tmp2 = gen_lowpart (bcst64_mode, op2);
   27895            1 :               if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
   27896              :                 {
   27897            0 :                   tmp2 = ix86_expand_ternlog (bcst64_mode,
   27898            0 :                                               gen_lowpart (bcst64_mode, tmp0),
   27899            0 :                                               gen_lowpart (bcst64_mode, tmp1),
   27900              :                                               tmp2, idx, NULL_RTX);
   27901            0 :                   emit_move_insn (target, gen_lowpart (mode, tmp2));
   27902            0 :                   return target;
   27903              :                 }
   27904              :             }
   27905              : 
   27906           22 :           tmp2 = force_const_mem (mode, op2);
   27907           22 :           rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
   27908           22 :           tmp2 = validize_mem (tmp2);
   27909           22 :           if (bcast)
   27910              :             {
   27911           12 :               rtx reg2 = gen_reg_rtx (mode);
   27912           12 :               bool ok = ix86_expand_vector_init_duplicate (false, mode,
   27913              :                                                            reg2, bcast);
   27914           12 :               if (ok)
   27915         2329 :                 tmp2 = reg2;
   27916              :             }
   27917              :         }
   27918              :     }
   27919              :   else
   27920              :     tmp2 = op2;
   27921         2329 :   if (GET_MODE (tmp2) != mode)
   27922            0 :     tmp2 = gen_lowpart (mode, tmp2);
   27923              :   /* Some memory_operands are not vector_memory_operands.  */
   27924         2329 :   if (!bcst_vector_operand (tmp2, mode))
   27925            0 :     tmp2 = force_reg (mode, tmp2);
   27926              : 
   27927         2329 :   rtvec vec = gen_rtvec (4, tmp0, tmp1, tmp2, GEN_INT (idx));
   27928         2329 :   emit_move_insn (target, gen_rtx_UNSPEC (mode, vec, UNSPEC_VTERNLOG));
   27929         2329 :   return target;
   27930              : }
   27931              : 
   27932              : /* GF2P8AFFINEQB matrices to implement shift and rotate.
                           Each table in this group is indexed by the shift/rotate count
                           (1..7).  Slot 0 is a placeholder: ix86_vgf2p8affine_shift_matrix
                           asserts a nonzero count before indexing.  An entry packs eight
                           matrix bytes into one 64-bit value; that function stores byte
                           (i % 8), least significant byte first, into vector element i.  */
   27933              : 
                        /* Table selected for rtx code ASHIFT; the per-entry notes give the
                           count and the direction ("l" = left).  */
   27934              : static const uint64_t matrix_ashift[8] =
   27935              : {
   27936              :   0, /* placeholder, count 0 is never looked up */
   27937              :   0x0001020408102040, /* 1 l */
   27938              :   0x0000010204081020, /* 2 l */
   27939              :   0x0000000102040810, /* 3 l */
   27940              :   0x0000000001020408, /* 4 l */
   27941              :   0x0000000000010204, /* 5 l */
   27942              :   0x0000000000000102, /* 6 l */
   27943              :   0x0000000000000001  /* 7 l */
   27944              : };
   27945              : 
                        /* Table selected for rtx code LSHIFTRT, indexed by the shift count
                           (1..7); the per-entry notes give the count and the direction
                           ("r" = right).  Slot 0 is a placeholder that is never looked up
                           because ix86_vgf2p8affine_shift_matrix asserts a nonzero count.  */
   27946              : static const uint64_t matrix_lshiftrt[8] =
   27947              : {
   27948              :   0, /* placeholder, count 0 is never looked up */
   27949              :   0x0204081020408000, /* 1 r */
   27950              :   0x0408102040800000, /* 2 r */
   27951              :   0x0810204080000000, /* 3 r */
   27952              :   0x1020408000000000, /* 4 r */
   27953              :   0x2040800000000000, /* 5 r */
   27954              :   0x4080000000000000, /* 6 r */
   27955              :   0x8000000000000000  /* 7 r */
   27956              : };
   27957              : 
                        /* Table selected for rtx code ASHIFTRT, indexed by the shift count
                           (1..7).  Entry for entry it equals matrix_lshiftrt except that
                           every byte that is 0x00 there is 0x80 here.
                           NOTE(review): presumably the 0x80 rows make the vacated high bits
                           copy the sign bit -- confirm against the GF2P8AFFINEQB bit
                           numbering.  Slot 0 is a placeholder that is never looked up.  */
   27958              : static const uint64_t matrix_ashiftrt[8] =
   27959              : {
   27960              :   0, /* placeholder, count 0 is never looked up */
   27961              :   0x0204081020408080, /* 1 r */
   27962              :   0x0408102040808080, /* 2 r */
   27963              :   0x0810204080808080, /* 3 r */
   27964              :   0x1020408080808080, /* 4 r */
   27965              :   0x2040808080808080, /* 5 r */
   27966              :   0x4080808080808080, /* 6 r */
   27967              :   0x8080808080808080  /* 7 r */
   27968              : };
   27969              : 
                        /* Table selected for rtx code ROTATE, indexed by the rotate count
                           (1..7); the per-entry notes mark these as 8-bit left rotates.
                           Entry N here has the same value as entry 8 - N of matrix_rotatert.
                           Slot 0 is a placeholder that is never looked up.  */
   27970              : static const uint64_t matrix_rotate[8] =
   27971              : {
   27972              :   0, /* placeholder, count 0 is never looked up */
   27973              :   0x8001020408102040, /* 1 rol8 */
   27974              :   0x4080010204081020, /* 2 rol8 */
   27975              :   0x2040800102040810, /* 3 rol8 */
   27976              :   0x1020408001020408, /* 4 rol8 */
   27977              :   0x0810204080010204, /* 5 rol8 */
   27978              :   0x0408102040800102, /* 6 rol8 */
   27979              :   0x0204081020408001  /* 7 rol8 */
   27980              : };
   27981              : 
                        /* Table selected for rtx code ROTATERT, indexed by the rotate count
                           (1..7); the per-entry notes mark these as 8-bit right rotates.
                           Entry N here has the same value as entry 8 - N of matrix_rotate.
                           Slot 0 is a placeholder that is never looked up.  */
   27982              : static const uint64_t matrix_rotatert[8] =
   27983              : {
   27984              :   0, /* placeholder, count 0 is never looked up */
   27985              :   0x0204081020408001, /* 1 ror8 */
   27986              :   0x0408102040800102, /* 2 ror8 */
   27987              :   0x0810204080010204, /* 3 ror8 */
   27988              :   0x1020408001020408, /* 4 ror8 */
   27989              :   0x2040800102040810, /* 5 ror8 */
   27990              :   0x4080010204081020, /* 6 ror8 */
   27991              :   0x8001020408102040  /* 7 ror8 */
   27992              : };
   27993              : 
   27994              : /* Return rtx to load a 64bit GF2P8AFFINE GF(2) matrix implementing a shift
   27995              :    for CODE and shift count COUNT into register with vector of size of SRC.
                           CODE must be ASHIFT, ASHIFTRT, LSHIFTRT, ROTATE or ROTATERT; any
                           other code reaches gcc_unreachable.  COUNT is read with INTVAL and
                           only its low three bits are used; they must be nonzero, so a
                           count that is a multiple of 8 trips the assertion.  The result
                           comes from force_reg in SRC's mode and holds the selected 8-byte
                           matrix repeated over all elements of that mode.  */
   27996              : 
   27997              : rtx
   27998          202 : ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
   27999              : {
   28000          202 :   machine_mode mode = GET_MODE (src);
   28001          202 :   const uint64_t *matrix;
   28002          202 :   unsigned shift = INTVAL (count) & 7;
                          /* After the mask SHIFT is always below 8, so in effect this only
                             rejects a zero count (slot 0 of the tables is a placeholder).  */
   28003          202 :   gcc_assert (shift > 0 && shift < 8);
   28004              : 
                          /* Pick the table that matches the requested operation.  */
   28005          202 :   switch (code)
   28006              :     {
   28007              :     case ASHIFT:
   28008              :       matrix = matrix_ashift;
   28009              :       break;
   28010           27 :     case ASHIFTRT:
   28011           27 :       matrix = matrix_ashiftrt;
   28012           27 :       break;
   28013           30 :     case LSHIFTRT:
   28014           30 :       matrix = matrix_lshiftrt;
   28015           30 :       break;
   28016           34 :     case ROTATE:
   28017           34 :       matrix = matrix_rotate;
   28018           34 :       break;
   28019           35 :     case ROTATERT:
   28020           35 :       matrix = matrix_rotatert;
   28021           35 :       break;
   28022            0 :     default:
   28023            0 :       gcc_unreachable ();
   28024              :     }
   28025              : 
   28026          202 :   int nelts = GET_MODE_NUNITS (mode);
   28027          202 :   rtvec vec = rtvec_alloc (nelts);
   28028          202 :   uint64_t ma = matrix[shift];
                          /* Element I receives byte (I % 8) of the 64-bit matrix, least
                             significant byte first, so the 8-byte pattern repeats.  */
   28029         6922 :   for (int i = 0; i < nelts; i++)
   28030         6720 :     RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
   28031              : 
   28032          202 :   return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
   28033              : }
   28034              : 
   28035              : /* Trunc a vector to a narrow vector, like v4di -> v4si.
                           INPUT must be a REG or SUBREG.  CVT_MODE must have the same total
                           size as INPUT's mode and the same element mode as OUTPUT; all
                           three conditions are asserted.  INPUT is viewed in CVT_MODE and a
                           one-operand constant permutation gathers the wanted elements into
                           the low positions; the low part of the permuted vector, taken in
                           OUTPUT's mode, is then moved to OUTPUT.  */
   28036              : 
   28037              : void
   28038           63 : ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode)
   28039              : {
   28040           63 :   machine_mode out_mode = GET_MODE (output);
   28041           63 :   machine_mode in_mode = GET_MODE (input);
   28042           63 :   int len = GET_MODE_SIZE (in_mode);
   28043          252 :   gcc_assert (len == GET_MODE_SIZE (cvt_mode)
   28044              :               && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode)
   28045              :               && (REG_P (input) || SUBREG_P (input)));
   28046           63 :   scalar_mode inner_out_mode = GET_MODE_INNER (out_mode);
   28047          126 :   int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
   28048           63 :   int out_innersize = GET_MODE_SIZE (inner_out_mode);
   28049              : 
                          /* Describe a permutation of INPUT (reinterpreted in CVT_MODE) with
                             itself; both operands are the same register.  */
   28050           63 :   struct expand_vec_perm_d d;
   28051           63 :   d.target = gen_reg_rtx (cvt_mode);
   28052           63 :   d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode);
   28053           63 :   d.op1 = d.op0;
   28054           63 :   d.vmode = cvt_mode;
   28055           63 :   d.nelt = GET_MODE_NUNITS (cvt_mode);
   28056           63 :   d.testing_p = false;
   28057           63 :   d.one_operand_p = true;
   28058              : 
   28059              :   /* Init perm. Put the needed bits of input in order and
   28060              :      fill the rest of bits by default.  */
                          /* For the first GET_MODE_NUNITS (out_mode) slots the index is
                             I times the number of CVT_MODE elements per INPUT element, i.e.
                             the first narrow element inside wide element I; every other
                             slot keeps the identity index.  */
   28061          687 :   for (int i = 0; i < d.nelt; ++i)
   28062              :     {
   28063          624 :       d.perm[i] = i;
   28064         1248 :       if (i < GET_MODE_NUNITS (out_mode))
   28065          246 :         d.perm[i] = i * (in_innersize / out_innersize);
   28066              :     }
   28067              : 
                          /* The permutation is required to be expandable.  */
   28068           63 :   bool ok = ix86_expand_vec_perm_const_1(&d);
   28069           63 :   gcc_assert (ok);
   28070           63 :   emit_move_insn (output, gen_lowpart (out_mode, d.target));
   28071           63 : }
   28072              : 
   28073              : /* Implement truncv8sfv8bf2 with vector permutation.
                           SRC may be V16SF, V8SF or V4SF (anything else is unreachable).  It
                           is viewed as a BFmode vector of the same size, i.e. with twice as
                           many elements, and a one-input constant permutation moves the
                           odd-numbered BF elements, in order, to the front.  DEST receives
                           the low part of the permuted vector in DEST's own mode.
                           NOTE(review): on little-endian x86 the odd BF element should be
                           the high half of each SF element, so this looks like plain
                           truncation without rounding -- confirm.  */
   28074              : void
   28075            8 : ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
   28076              : {
   28077            8 :   machine_mode vperm_mode, src_mode = GET_MODE (src);
                          /* Choose the BF vector mode with the same size as SRC's mode.  */
   28078            8 :   switch (src_mode)
   28079              :     {
   28080              :     case V16SFmode:
   28081              :       vperm_mode = V32BFmode;
   28082              :       break;
   28083            2 :     case V8SFmode:
   28084            2 :       vperm_mode = V16BFmode;
   28085            2 :       break;
   28086            4 :     case V4SFmode:
   28087            4 :       vperm_mode = V8BFmode;
   28088            4 :       break;
   28089            0 :     default:
   28090            0 :       gcc_unreachable ();
   28091              :     }
   28092              : 
   28093            8 :   int nelt = GET_MODE_NUNITS (vperm_mode);
   28094            8 :   vec_perm_builder sel (nelt, nelt, 1);
   28095            8 :   sel.quick_grow (nelt);
                          /* Selector 1, 3, 5, ...; the modulo makes the second half of the
                             selector repeat the same sequence instead of running past
                             NELT.  */
   28096          136 :   for (int i = 0; i != nelt; i++)
   28097          128 :     sel[i] = (2 * i + 1) % nelt;
   28098           16 :   vec_perm_indices indices (sel, 1, nelt);
   28099              : 
   28100            8 :   rtx target = gen_reg_rtx (vperm_mode);
   28101            8 :   rtx op0 = lowpart_subreg (vperm_mode,
   28102              :                             force_reg (src_mode, src),
   28103              :                             src_mode);
                          /* Same register for both inputs; the target hook must succeed.  */
   28104            8 :   bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
   28105              :                                               target, op0, op0, indices);
   28106            8 :   gcc_assert (ok);
   28107            8 :   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
   28108            8 : }
   28109              : 
   28110              : /* Implement extendv8bf2v8sf2 with vector permutation.
                           SRC may be V16BF, V8BF or V4BF (anything else is unreachable).  It
                           is viewed in a BF vector mode with twice as many elements and a
                           two-input constant permutation interleaves it with an all-zero
                           vector: even result elements come from the zero vector, odd
                           result elements from SRC, each consumed in order.  DEST receives
                           the low part of the permuted vector in DEST's own mode.
                           NOTE(review): on little-endian x86 this should put each BF value
                           in the high half of an SF element with a zero low half --
                           confirm.  */
   28111              : void
   28112            8 : ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
   28113              : {
   28114            8 :   machine_mode vperm_mode, src_mode = GET_MODE (src);
                          /* Choose the BF vector mode with twice the elements of SRC's
                             mode.  */
   28115            8 :   switch (src_mode)
   28116              :     {
   28117              :     case V16BFmode:
   28118              :       vperm_mode = V32BFmode;
   28119              :       break;
   28120            2 :     case V8BFmode:
   28121            2 :       vperm_mode = V16BFmode;
   28122            2 :       break;
   28123            4 :     case V4BFmode:
   28124            4 :       vperm_mode = V8BFmode;
   28125            4 :       break;
   28126            0 :     default:
   28127            0 :       gcc_unreachable ();
   28128              :     }
   28129              : 
   28130            8 :   int nelt = GET_MODE_NUNITS (vperm_mode);
   28131            8 :   vec_perm_builder sel (nelt, nelt, 1);
   28132            8 :   sel.quick_grow (nelt);
                          /* Selector 0, NELT, 1, NELT + 1, ...: K walks the first input
                             (indices below NELT), J walks the second input (indices from
                             NELT upwards).  */
   28133          136 :   for (int i = 0, k = 0, j = nelt; i != nelt; i++)
   28134          128 :     sel[i] = i & 1 ? j++ : k++;
   28135              : 
   28136           16 :   vec_perm_indices indices (sel, 2, nelt);
   28137              : 
   28138            8 :   rtx target = gen_reg_rtx (vperm_mode);
                          /* SRC is the second permutation input, the zero vector the
                             first.  */
   28139            8 :   rtx op1 = lowpart_subreg (vperm_mode,
   28140              :                             force_reg (src_mode, src),
   28141              :                             src_mode);
   28142            8 :   rtx op0 = CONST0_RTX (vperm_mode);
   28143            8 :   bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
   28144              :                                               target, op0, op1, indices);
   28145            8 :   gcc_assert (ok);
   28146            8 :   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
   28147            8 : }
   28148              : 
   28149              : /* Implement bitreverse<mode>2 using gf2p8affineqb.
                           The scalar mode is taken from DEST; the modes tested explicitly
                           below are QImode, HImode, SImode, DImode and TImode.  The
                           expansion has three steps: place SRC in a vector register, apply
                           one vgf2p8affineqb with a constant matrix, then bring the result
                           back to a scalar register and byte-swap it (multi-byte modes
                           only).
                           NOTE(review): the affine step presumably reverses the bits inside
                           every byte, with the byte swap completing the full reversal --
                           confirm against the instruction definition.  */
   28150              : 
   28151              : void
   28152            5 : ix86_expand_gfni_bitreverse (rtx dest, rtx src)
   28153              : {
   28154            5 :   machine_mode mode = GET_MODE (dest);
   28155            5 :   rtx temp;
                          /* Step 1: get SRC into a vector register TEMP.  */
   28156           10 :   if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
   28157              :     {
                              /* Wider than a word (TImode, otherwise treated as DImode):
                                 put each half in the low element of its own zeroed vector,
                                 then interleave the low V2DI elements so that both halves
                                 end up in TEMP.  */
   28158            1 :       rtx temp1 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
   28159            1 :       rtx temp2 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
   28160            1 :       if (mode == TImode)
   28161              :         {
   28162            1 :           temp = lowpart_subreg (DImode, src, TImode);
   28163            1 :           emit_insn (gen_rtx_SET (temp1, gen_rtx_VEC_CONCAT (V2DImode, temp,
   28164              :                                                              const0_rtx)));
   28165            1 :           temp = gen_highpart (DImode, src);
   28166            1 :           emit_insn (gen_rtx_SET (temp2, gen_rtx_VEC_CONCAT (V2DImode, temp,
   28167              :                                                              const0_rtx)));
   28168              :         }
   28169              :       else
   28170              :         {
   28171            0 :           temp = lowpart_subreg (SImode, src, DImode);
   28172            0 :           emit_insn (gen_vec_setv4si_0 (temp1, CONST0_RTX (V4SImode), temp));
   28173            0 :           temp = gen_highpart (SImode, src);
   28174            0 :           emit_insn (gen_vec_setv4si_0 (temp2, CONST0_RTX (V4SImode), temp));
   28175            0 :           temp1 = lowpart_subreg (V2DImode, temp1, V4SImode);
   28176            0 :           temp2 = lowpart_subreg (V2DImode, temp2, V4SImode);
   28177              :         }
   28178            1 :       temp = gen_reg_rtx (V2DImode);
   28179            1 :       emit_insn (gen_vec_interleave_lowv2di (temp, temp1, temp2));
   28180              :     }
   28181            4 :   else if (mode != DImode)
   28182              :     {
                              /* Not wider than a word and not DImode: insert the value as
                                 SImode into element 0 of a zero V4SI vector.  Modes other
                                 than SImode are first forced into a register and viewed
                                 through an SImode lowpart subreg.  */
   28183            3 :       if (mode != SImode)
   28184              :         {
   28185            2 :           src = force_reg (mode, src);
   28186            2 :           src = lowpart_subreg (SImode, src, mode);
   28187              :         }
   28188            3 :       temp = gen_reg_rtx (V4SImode);
   28189            3 :       emit_insn (gen_vec_setv4si_0 (temp, CONST0_RTX (V4SImode), src));
   28190              :     }
   28191              :   else
   28192              :     {
                              /* DImode that fits in a word: concatenate with zero into a
                                 V2DI vector.  */
   28193            1 :       temp = gen_reg_rtx (V2DImode);
   28194            1 :       emit_insn (gen_rtx_SET (temp, gen_rtx_VEC_CONCAT (V2DImode, src,
   28195              :                                                         const0_rtx)));
   28196              :     }
   28197            5 :   src = temp;
   28198            5 :   temp = gen_reg_rtx (V16QImode);
                          /* Step 2: build the matrix operand, bytes 1, 2, 4, ..., 64, -128
                             repeated for both 8-byte halves, place it in the constant pool
                             and apply vgf2p8affineqb with a zero immediate to the V16QI
                             view of the vector built above.  */
   28199            5 :   rtx src2 = gen_rtx_CONST_VECTOR (V16QImode,
   28200              :                                    gen_rtvec (16, GEN_INT (1), GEN_INT (2),
   28201              :                                               GEN_INT (4), GEN_INT (8),
   28202              :                                               GEN_INT (16), GEN_INT (32),
   28203              :                                               GEN_INT (64), GEN_INT (-128),
   28204              :                                               GEN_INT (1), GEN_INT (2),
   28205              :                                               GEN_INT (4), GEN_INT (8),
   28206              :                                               GEN_INT (16), GEN_INT (32),
   28207              :                                               GEN_INT (64), GEN_INT (-128)));
   28208            5 :   src2 = validize_mem (force_const_mem (V16QImode, src2));
   28209            5 :   src = lowpart_subreg (V16QImode, src, GET_MODE (src));
   28210            5 :   emit_insn (gen_vgf2p8affineqb_v16qi (temp, src, src2, const0_rtx));
                          /* Step 3: return to a scalar register.  QImode extracts element 0
                             as SImode and stores its QImode low part; no byte swap is
                             emitted on this path.  */
   28211            5 :   if (mode == QImode)
   28212              :     {
   28213            1 :       rtx temp1 = gen_reg_rtx (SImode);
   28214            1 :       rtx temp2 = lowpart_subreg (V4SImode, temp, V16QImode);
   28215            1 :       rtx temp3 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
   28216            1 :       emit_insn (gen_rtx_SET (temp1,
   28217              :                               gen_rtx_VEC_SELECT (SImode, temp2, temp3)));
   28218            1 :       emit_move_insn (dest, lowpart_subreg (QImode, temp1, SImode));
   28219            1 :       return;
   28220              :     }
                          /* TARGET takes the low part of the vector result.  Its mode is
                             SImode for modes narrower than 4 bytes or when !TARGET_64BIT,
                             DImode for TImode, and MODE itself otherwise.  */
   28221           11 :   rtx target = gen_reg_rtx ((GET_MODE_SIZE (mode) < 4 || !TARGET_64BIT)
   28222            3 :                             ? SImode : mode == TImode ? DImode : mode);
   28223            4 :   emit_move_insn (target, lowpart_subreg (GET_MODE (target), temp, V16QImode));
   28224            8 :   if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
   28225              :     {
                              /* TEMP1 receives the other half: element 1 of the V2DI view
                                 for TImode, element 2 of the V4SI view otherwise.  Without
                                 SSE4.1 the non-TImode case instead replicates element 2
                                 with pshufd 0xaa and takes the low part.  */
   28226            1 :       rtx temp1 = gen_reg_rtx (GET_MODE (target));
   28227            1 :       if (mode == TImode || TARGET_SSE4_1)
   28228              :         {
   28229            1 :           rtx temp2 = lowpart_subreg (mode == TImode ? V2DImode : V4SImode,
   28230              :                                       temp, V16QImode);
   28231            1 :           rtx temp3 = gen_rtx_PARALLEL (VOIDmode,
   28232              :                                         gen_rtvec (1, GEN_INT (mode == TImode
   28233              :                                                                ? 1 : 2)));
   28234            1 :           emit_insn (gen_rtx_SET (temp1,
   28235              :                                   gen_rtx_VEC_SELECT (GET_MODE (target), temp2,
   28236              :                                                       temp3)));
   28237            1 :         }
   28238              :       else
   28239              :         {
   28240            0 :           rtx temp2 = gen_reg_rtx (V4SImode);
   28241            0 :           rtx temp3 = lowpart_subreg (V4SImode, temp, V16QImode);
   28242            0 :           emit_insn (gen_sse2_pshufd (temp2, temp3, GEN_INT (0xaa)));
   28243            0 :           emit_move_insn (temp1, lowpart_subreg (GET_MODE (target), temp2,
   28244              :                                                  V4SImode));
   28245              :         }
                              /* Byte-swap both halves and recombine them exchanged:
                                 DEST = (zext (bswap (TARGET)) << half width)
                                        | zext (bswap (TEMP1)).  */
   28246            1 :       rtx temp4 = gen_reg_rtx (GET_MODE (target));
   28247            1 :       rtx temp5 = gen_reg_rtx (GET_MODE (target));
   28248            0 :       rtx (*gen_bswap) (rtx, rtx)
   28249            1 :         = mode == TImode ? gen_bswapdi2 : gen_bswapsi2;
   28250            1 :       emit_insn (gen_bswap (temp4, target));
   28251            1 :       emit_insn (gen_bswap (temp5, temp1));
   28252            1 :       temp4 = gen_rtx_ZERO_EXTEND (mode, temp4);
   28253            1 :       temp5 = gen_rtx_ZERO_EXTEND (mode, temp5);
   28254            1 :       rtx shift = GEN_INT (GET_MODE_PRECISION (GET_MODE (target)));
   28255            1 :       temp4 = gen_rtx_ASHIFT (mode, temp4, shift);
   28256            1 :       emit_insn (gen_rtx_SET (dest, gen_rtx_IOR (mode, temp4, temp5)));
   28257            1 :       return;
   28258              :     }
                          /* Single-register case: finish with a byte swap into DEST.  HImode
                             swaps the HImode low part of the SImode TARGET; SImode uses the
                             bswapsi2 generator; other modes emit a plain BSWAP set.  */
   28259            3 :   if (mode == HImode)
   28260            1 :     target = lowpart_subreg (mode, target, SImode);
   28261            3 :   if (mode == SImode)
   28262            1 :     emit_insn (gen_bswapsi2 (dest, target));
   28263              :   else
   28264            2 :     emit_insn (gen_rtx_SET (dest, gen_rtx_BSWAP (mode, target)));
   28265              : }
   28266              : 
   28267              : #include "gt-i386-expand.h"
        

Generated by: LCOV version 2.4-beta

The LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.