LCOV - code coverage report
Current view: top level - gcc/config/i386 - i386-expand.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 86.9 % 15021 13060
Test Date: 2026-02-28 14:20:25 Functions: 93.7 % 270 253
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
       2              : 
       3              : This file is part of GCC.
       4              : 
       5              : GCC is free software; you can redistribute it and/or modify
       6              : it under the terms of the GNU General Public License as published by
       7              : the Free Software Foundation; either version 3, or (at your option)
       8              : any later version.
       9              : 
      10              : GCC is distributed in the hope that it will be useful,
      11              : but WITHOUT ANY WARRANTY; without even the implied warranty of
      12              : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13              : GNU General Public License for more details.
      14              : 
      15              : You should have received a copy of the GNU General Public License
      16              : along with GCC; see the file COPYING3.  If not see
      17              : <http://www.gnu.org/licenses/>.  */
      18              : 
      19              : #define IN_TARGET_CODE 1
      20              : 
      21              : #include "config.h"
      22              : #include "system.h"
      23              : #include "coretypes.h"
      24              : #include "backend.h"
      25              : #include "rtl.h"
      26              : #include "tree.h"
      27              : #include "memmodel.h"
      28              : #include "gimple.h"
      29              : #include "cfghooks.h"
      30              : #include "cfgloop.h"
      31              : #include "df.h"
      32              : #include "tm_p.h"
      33              : #include "stringpool.h"
      34              : #include "expmed.h"
      35              : #include "optabs.h"
      36              : #include "regs.h"
      37              : #include "emit-rtl.h"
      38              : #include "recog.h"
      39              : #include "cgraph.h"
      40              : #include "diagnostic.h"
      41              : #include "cfgbuild.h"
      42              : #include "alias.h"
      43              : #include "fold-const.h"
      44              : #include "attribs.h"
      45              : #include "calls.h"
      46              : #include "stor-layout.h"
      47              : #include "varasm.h"
      48              : #include "output.h"
      49              : #include "insn-attr.h"
      50              : #include "flags.h"
      51              : #include "except.h"
      52              : #include "explow.h"
      53              : #include "expr.h"
      54              : #include "cfgrtl.h"
      55              : #include "common/common-target.h"
      56              : #include "langhooks.h"
      57              : #include "reload.h"
      58              : #include "gimplify.h"
      59              : #include "dwarf2.h"
      60              : #include "tm-constrs.h"
      61              : #include "cselib.h"
      62              : #include "sched-int.h"
      63              : #include "opts.h"
      64              : #include "tree-pass.h"
      65              : #include "context.h"
      66              : #include "pass_manager.h"
      67              : #include "target-globals.h"
      68              : #include "gimple-iterator.h"
      69              : #include "shrink-wrap.h"
      70              : #include "builtins.h"
      71              : #include "rtl-iter.h"
      72              : #include "tree-iterator.h"
      73              : #include "dbgcnt.h"
      74              : #include "case-cfn-macros.h"
      75              : #include "dojump.h"
      76              : #include "fold-const-call.h"
      77              : #include "tree-vrp.h"
      78              : #include "tree-ssanames.h"
      79              : #include "selftest.h"
      80              : #include "selftest-rtl.h"
      81              : #include "print-rtl.h"
      82              : #include "intl.h"
      83              : #include "ifcvt.h"
      84              : #include "symbol-summary.h"
      85              : #include "sreal.h"
      86              : #include "ipa-cp.h"
      87              : #include "ipa-prop.h"
      88              : #include "ipa-fnsummary.h"
      89              : #include "wide-int-bitmask.h"
      90              : #include "tree-vector-builder.h"
      91              : #include "debug.h"
      92              : #include "dwarf2out.h"
      93              : #include "i386-options.h"
      94              : #include "i386-builtins.h"
      95              : #include "i386-expand.h"
      96              : #include "asan.h"
      97              : 
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  /* Pick the half-width mode; only double-word integer modes and the
     partial-integer pair modes are supported.  */
  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    case E_P2HImode:
      half_mode = HImode;
      break;
    case E_P2QImode:
      half_mode = QImode;
      break;
    default:
      gcc_unreachable ();
    }

  /* Byte offset of the high half within the double-mode value.  */
  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
	 but we still have to handle it.  */
      if (MEM_P (op))
	{
	  /* Reuse the halves of an identical MEM already split earlier in
	     this call so equal memory operands share the same half RTL.  */
	  if (mem_op && rtx_equal_p (op, mem_op))
	    {
	      lo_half[num] = lo_half[mem_num];
	      hi_half[num] = hi_half[mem_num];
	    }
	  else
	    {
	      mem_op = op;
	      mem_num = num;
	      lo_half[num] = adjust_address (op, half_mode, 0);
	      hi_half[num] = adjust_address (op, half_mode, byte);
	    }
	}
      else
	{
	  /* VOIDmode constants (CONST_INT/CONST_DOUBLE) take their
	     mode from MODE.  */
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);

	  rtx tmp = simplify_gen_subreg (half_mode, op,
					 GET_MODE (op) == VOIDmode
					 ? mode : GET_MODE (op), byte);
	  /* simplify_gen_subreg will return NULL RTX for the
	     high half of the paradoxical subreg.  */
	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
	}
    }
}
     169              : 
/* Emit the double word assignment DST = { LO, HI }.  Emits the two
   half-word moves in whichever order avoids clobbering a source half
   or a half used in a source memory address before it is read.  */

void
split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
{
  rtx dlo, dhi;
  int deleted_move_count = 0;
  split_double_mode (mode, &dst, 1, &dlo, &dhi);
  /* Constraints ensure that if both lo and hi are MEMs, then
     dst has early-clobber and thus addresses of MEMs don't use
     dlo/dhi registers.  Otherwise if at least one of li and hi are MEMs,
     dlo/dhi are registers.  */
  if (MEM_P (lo)
      && rtx_equal_p (dlo, hi)
      && reg_overlap_mentioned_p (dhi, lo))
    {
      /* If dlo is same as hi and lo's address uses dhi register,
	 code below would first emit_move_insn (dhi, hi)
	 and then emit_move_insn (dlo, lo).  But the former
	 would invalidate lo's address.  Load into dhi first,
	 then swap.  */
      emit_move_insn (dhi, lo);
      lo = dhi;
    }
  else if (MEM_P (hi)
	   && !MEM_P (lo)
	   && !rtx_equal_p (dlo, lo)
	   && reg_overlap_mentioned_p (dlo, hi))
    {
      /* In this case, code below would first emit_move_insn (dlo, lo)
	 and then emit_move_insn (dhi, hi).  But the former would
	 invalidate hi's address.  */
      if (rtx_equal_p (dhi, lo))
	{
	  /* We can't load into dhi first, so load into dlo
	     first and we'll swap.  */
	  emit_move_insn (dlo, hi);
	  hi = dlo;
	}
      else
	{
	  /* Load into dhi first.  */
	  emit_move_insn (dhi, hi);
	  hi = dhi;
	}
    }
  if (!rtx_equal_p (dlo, hi))
    {
      /* DLO does not overlap HI: low-then-high order is safe.
	 Skip (and count) moves that are already no-ops.  */
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
    }
  else if (!rtx_equal_p (lo, dhi))
    {
      /* DLO == HI, so writing DLO first would clobber HI;
	 emit high-then-low instead.  */
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
    }
  /* Here DLO == HI and DHI == LO: the destination already holds both
     values, just in swapped halves — exchange them in place.  */
  else if (mode == TImode)
    emit_insn (gen_swapdi (dlo, dhi));
  else
    emit_insn (gen_swapsi (dlo, dhi));

  /* Both half moves were no-ops.  NOTE(review): presumably the note keeps
     the expansion from being an empty insn sequence — confirm.  */
  if (deleted_move_count == 2)
    emit_note (NOTE_INSN_DELETED);
}
     246              : 
     247              : 
     248              : /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
     249              :    for the target.  */
     250              : 
     251              : void
     252       112711 : ix86_expand_clear (rtx dest)
     253              : {
     254       112711 :   rtx tmp;
     255              : 
     256              :   /* We play register width games, which are only valid after reload.  */
     257       112711 :   gcc_assert (reload_completed);
     258              : 
     259              :   /* Avoid HImode and its attendant prefix byte.  */
     260       225422 :   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
     261          991 :     dest = gen_rtx_REG (SImode, REGNO (dest));
     262       112711 :   tmp = gen_rtx_SET (dest, const0_rtx);
     263              : 
     264       112711 :   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
     265              :     {
     266       112711 :       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
     267       112711 :       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
     268              :     }
     269              : 
     270       112711 :   emit_insn (tmp);
     271       112711 : }
     272              : 
     273              : /* Return true if V can be broadcasted from an integer of WIDTH bits
     274              :    which is returned in VAL_BROADCAST.  Otherwise, return false.  */
     275              : 
     276              : static bool
     277         4851 : ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
     278              :                 HOST_WIDE_INT &val_broadcast)
     279              : {
     280         4851 :   wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
     281         4851 :   val_broadcast = wi::extract_uhwi (val, 0, width);
     282         6543 :   for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
     283              :     {
     284         5089 :       HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
     285         5089 :       if (val_broadcast != each)
     286              :         return false;
     287              :     }
     288         1454 :   val_broadcast = sext_hwi (val_broadcast, width);
     289         1454 :   return true;
     290         4851 : }
     291              : 
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  Returns
   a MODE-sized register holding the broadcast, or NULL when OP cannot
   (or should not) be expanded as a broadcast on this target.  */

rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  unsigned int msize = GET_MODE_SIZE (mode);

  /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm.  */
  if (msize != 16 && msize != 32 && msize != 64)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  Standard SSE
     constants (all zeros/ones) already have cheaper expansions.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
	  != GET_MODE_BITSIZE (mode)))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  /* Try element widths from narrowest to widest.
     vpbroadcastb zmm requires TARGET_AVX512BW.  */
  if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
			 val_broadcast))
    broadcast_mode = QImode;
  else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
			      val_broadcast))
    broadcast_mode = HImode;
  /* vbroadcasts[sd] only support a memory operand without AVX2.
     When msize == 16, pshufs is used for vec_duplicate.
     When msize == 64, vpbroadcastd is used, which requires
     TARGET_AVX512F.  */
  else if ((msize != 32 || TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
			   val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
			      val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcasted from VAL, i.e. all of its
     HOST_WIDE_INT elements equal the first one.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
			 / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = gen_reg_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
					       target,
					       GEN_INT (val_broadcast));
  if (!ok)
    return nullptr;
  /* Reinterpret the broadcast vector in the originally requested mode.  */
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
     362              : 
/* Expand a MODE-sized move from OPERANDS[1] to OPERANDS[0],
   legitimizing TLS/GOT/PIC symbol references, fixing up memory and
   constant operands as the target requires, and emitting the final
   SET insn.  */

void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.
     Move through a fresh pseudo and recurse, then fall through to move
     the pseudo into the hard register.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      /* (const (plus (symbol_ref ...) (const_int ...))) — peel off the
	 addend and handle the symbol as in the SYMBOL_REF case.  */
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
	  || !SYMBOL_REF_P (XEXP (tmp, 0)))
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				(TARGET_64BIT
				 ? UNSPEC_GOTPCREL
				 : UNSPEC_GOT));
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, GOT_ALIAS_SET);
	}
      else
	{
#if TARGET_PECOFF
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);

	  if (tmp)
	    {
	      op1 = tmp;
	      if (!addend)
		break;
	    }
	  else
#endif
	    {
	      /* Nothing to legitimize: restore the original operand
		 (including any peeled CONST wrapper) and move on.  */
	      op1 = operands[1];
	      break;
	    }
	}

      /* Re-attach the addend peeled off in the CONST case.  */
      if (addend)
	{
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      op1 = convert_to_mode (mode, op1, 1);
      /* FALLTHRU */

    default:
      break;

    case SUBREG:
      /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
      if (TARGET_64BIT
	  && mode == TImode
	  && SUBREG_P (op1)
	  && GET_MODE (SUBREG_REG (op1)) == DImode
	  && SUBREG_BYTE (op1) == 0)
	op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
      /* As not all values in XFmode are representable in real_value,
	 we might be called with unfoldable SUBREGs of constants.  */
      if (mode == XFmode
	  && CONSTANT_P (SUBREG_REG (op1))
	  && can_create_pseudo_p ())
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
#if TARGET_MACHO
      if (TARGET_MACHO && !TARGET_64BIT)
	{
	  /* dynamic-no-pic */
	  if (MACHOPIC_INDIRECT)
	    {
	      tmp = (op0 && REG_P (op0) && mode == Pmode)
		    ? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, tmp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       tmp == op1 ? 0 : tmp);
	    }
	  if (op0 != op1 && !MEM_P (op0))
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	}
#endif

      if (MEM_P (op0))
	op1 = force_reg (mode, op1);
      else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	{
	  /* Without pseudos (post-reload), reuse OP0 as the scratch for
	     PIC legitimization.  */
	  rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	  op1 = legitimize_pic_address (op1, reg);
	  if (op0 == op1)
	    return;
	  op1 = convert_to_mode (mode, op1, 1);
	}
    }
  else
    {
      /* mem-to-mem moves are not supported except for push, which has
	 its own addressing; force the source into a register.  */
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
	 to get them CSEed.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
	{
	  if (CONST_DOUBLE_P (op1))
	    {
	      /* If we are loading a floating point constant to a
		 register, force the value to memory now, since we'll
		 get better code out the back end.  */

	      op1 = validize_mem (force_const_mem (mode, op1));
	      if (!register_operand (op0, mode))
		{
		  tmp = gen_reg_rtx (mode);
		  emit_insn (gen_rtx_SET (tmp, op1));
		  emit_move_insn (op0, tmp);
		  return;
		}
	    }
	}
    }

  /* Special case inserting 64-bit values into a TImode register.  */
  if (TARGET_64BIT
      /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
      && (optimize || ix86_function_naked (current_function_decl))
      && (mode == DImode || mode == DFmode)
      && SUBREG_P (op0)
      && GET_MODE (SUBREG_REG (op0)) == TImode
      && REG_P (SUBREG_REG (op0))
      && REG_P (op1))
    {
      /* Use *insvti_lowpart_1 to set lowpart.  */
      if (SUBREG_BYTE (op0) == 0)
	{
	  /* (dst & ~0xffff...fff) | zext (op1): keep the high 64 bits,
	     replace the low 64 bits.  */
	  wide_int mask = wi::mask (64, true, 128);
	  tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  if (mode == DFmode)
	    op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
      /* Use *insvti_highpart_1 to set highpart.  */
      else if (SUBREG_BYTE (op0) == 8)
	{
	  /* (dst & 0xffff...fff) | (zext (op1) << 64): keep the low
	     64 bits, replace the high 64 bits.  */
	  wide_int mask = wi::mask (64, false, 128);
	  tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  if (mode == DFmode)
	    op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
     591              : 
      592              : /* OP is a memref of CONST_VECTOR, return scalar constant mem
      593              :    if CONST_VECTOR is a vec_duplicate, else return NULL.  */
                       : /* MODE is the vector mode of the access.  The returned rtx is the
                       :    duplicated element (in MODE's inner mode), suitable for feeding a
                       :    broadcast expansion.  */
      594              : rtx
      595      2461728 : ix86_broadcast_from_constant (machine_mode mode, rtx op)
      596              : {
      597      2461728 :   int nunits = GET_MODE_NUNITS (mode);
                       :   /* A single-element "vector" has nothing to broadcast.  */
      598      2461728 :   if (nunits < 2)
      599              :     return nullptr;
      600              : 
      601              :   /* Don't use integer vector broadcast if we can't move from GPR to SSE
      602              :      register directly.  */
      603      2323341 :   if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      604         8172 :       && INTEGRAL_MODE_P (mode))
      605              :     return nullptr;
      606              : 
      607              :   /* Convert CONST_VECTOR to a non-standard SSE constant integer
      608              :      broadcast only if vector broadcast is available.  */
      609      2317779 :   if (standard_sse_constant_p (op, mode))
      610              :     return nullptr;
      611              : 
                       :   /* A 128-bit inner element cannot be broadcast.  */
      612      4635552 :   if (GET_MODE_INNER (mode) == TImode)
      613              :     return nullptr;
      614              : 
      615      2317666 :   rtx constant = get_pool_constant (XEXP (op, 0));
      616      2317666 :   if (!CONST_VECTOR_P (constant))
      617              :     return nullptr;
      618              : 
      619              :   /* There could be some rtx like
      620              :      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
      621              :      but with "*.LC1" refer to V2DI constant vector.  */
      622      2317666 :   if (GET_MODE (constant) != mode)
      623              :     {
      624          609 :       constant = simplify_subreg (mode, constant, GET_MODE (constant),
      625              :                                   0);
      626          609 :       if (constant == nullptr || !CONST_VECTOR_P (constant))
      627              :         return nullptr;
      628              :     }
      629              : 
      630      2317666 :   rtx first = XVECEXP (constant, 0, 0);
      631              : 
                       :   /* Verify every element equals the first, i.e. a vec_duplicate.  */
      632      7628262 :   for (int i = 1; i < nunits; ++i)
      633              :     {
      634      7016480 :       rtx tmp = XVECEXP (constant, 0, i);
      635              :       /* Vector duplicate value.  */
      636      7016480 :       if (!rtx_equal_p (tmp, first))
      637              :         return nullptr;
      638              :     }
      639              : 
      640              :   return first;
      641              : }
     642              : 
                       : /* Expand a vector-mode move of OPERANDS[1] into OPERANDS[0] in MODE.
                       :    Forces awkward constants into memory (or a broadcast), expands
                       :    constant-pool vec_duplicate loads as register broadcasts, and
                       :    routes under-aligned SSE memory operands through
                       :    ix86_expand_vector_move_misalign.  */
      643              : void
      644      4731949 : ix86_expand_vector_move (machine_mode mode, rtx operands[])
      645              : {
      646      4731949 :   rtx op0 = operands[0], op1 = operands[1];
      647              :   /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
      648              :      psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
      649      4731949 :   unsigned int align = (TARGET_IAMCU
      650      4731949 :                         ? GET_MODE_BITSIZE (mode)
      651      4731949 :                         : GET_MODE_ALIGNMENT (mode));
      652              : 
      653      4731949 :   if (push_operand (op0, VOIDmode))
      654         2875 :     op0 = emit_move_resolve_push (mode, op0);
      655              : 
      656              :   /* Force constants other than zero into memory.  We do not know how
      657              :      the instructions used to build constants modify the upper 64 bits
      658              :      of the register, once we have that information we may be able
      659              :      to handle some of them more efficiently.  */
      660      4731949 :   if (can_create_pseudo_p ()
      661      4537479 :       && (CONSTANT_P (op1)
      662      4227665 :           || (SUBREG_P (op1)
      663       305643 :               && CONSTANT_P (SUBREG_REG (op1))))
      664      5041777 :       && ((register_operand (op0, mode)
      665       256345 :            && !standard_sse_constant_p (op1, mode))
      666              :           /* ix86_expand_vector_move_misalign() does not like constants.  */
      667              :           || (SSE_REG_MODE_P (mode)
      668       251859 :               && MEM_P (op0)
      669        37505 :               && MEM_ALIGN (op0) < align)))
      670              :     {
      671         2065 :       if (SUBREG_P (op1))
      672              :         {
                       :           /* Spill the inner constant in its own mode, then re-wrap the
                       :              subreg around the memory (or register) copy.  */
      673           14 :           machine_mode imode = GET_MODE (SUBREG_REG (op1));
      674           14 :           rtx r = force_const_mem (imode, SUBREG_REG (op1));
      675           14 :           if (r)
      676           14 :             r = validize_mem (r);
      677              :           else
      678            0 :             r = force_reg (imode, SUBREG_REG (op1));
      679           14 :           op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
      680              :         }
      681              :       else
      682              :         {
      683         2051 :           machine_mode mode = GET_MODE (op0);
                       :           /* Prefer a broadcast over a constant-pool load when the wide
                       :              integer constant is a duplicated element.  */
      684         2051 :           rtx tmp = ix86_convert_const_wide_int_to_broadcast
      685         2051 :             (mode, op1);
      686         2051 :           if (tmp == nullptr)
      687         2030 :             op1 = validize_mem (force_const_mem (mode, op1));
      688              :           else
      689              :             op1 = tmp;
      690              :         }
      691              :     }
      692              : 
                       :   /* Try to load a constant-pool vec_duplicate via a broadcast.  */
      693      4731949 :   if (can_create_pseudo_p ()
      694      4537479 :       && GET_MODE_SIZE (mode) >= 16
      695      3828947 :       && VECTOR_MODE_P (mode)
      696      8346178 :       && (MEM_P (op1)
      697       865749 :           && SYMBOL_REF_P (XEXP (op1, 0))
      698       488856 :           && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
      699              :     {
      700       472235 :       rtx first = ix86_broadcast_from_constant (mode, op1);
      701       472235 :       if (first != nullptr)
      702              :         {
      703              :           /* Broadcast to XMM/YMM/ZMM register from an integer
      704              :              constant or scalar mem.  */
      705       120637 :           rtx tmp = gen_reg_rtx (mode);
      706       120637 :           if (FLOAT_MODE_P (mode))
      707        29290 :             first = force_const_mem (GET_MODE_INNER (mode), first);
      708       120637 :           bool ok = ix86_expand_vector_init_duplicate (false, mode,
      709              :                                                        tmp, first);
                       :           /* Without TARGET_64BIT a DImode element can't sit in a GPR;
                       :              retry with the element spilled to memory.  */
      710       120637 :           if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
      711              :             {
      712            0 :               first = force_const_mem (GET_MODE_INNER (mode), first);
      713            0 :               ok = ix86_expand_vector_init_duplicate (false, mode,
      714              :                                                       tmp, first);
      715              :             }
      716       120637 :           if (ok)
      717              :             {
      718       120637 :               emit_move_insn (op0, tmp);
      719       120637 :               return;
      720              :             }
      721              :         }
      722              :     }
      723              : 
      724              :   /* We need to check memory alignment for SSE mode since attribute
      725              :      can make operands unaligned.  */
      726      4611312 :   if (can_create_pseudo_p ()
      727              :       && SSE_REG_MODE_P (mode)
      728      9349283 :       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
      729      4157112 :           || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
      730              :     {
      731       488057 :       rtx tmp[2];
      732              : 
      733              :       /* ix86_expand_vector_move_misalign() does not like both
      734              :          arguments in memory.  */
      735       488057 :       if (!register_operand (op0, mode)
      736       488057 :           && !register_operand (op1, mode))
      737              :         {
      738       153252 :           rtx scratch = gen_reg_rtx (mode);
      739       153252 :           emit_move_insn (scratch, op1);
      740       153252 :           op1 = scratch;
      741              :         }
      742              : 
      743       488057 :       tmp[0] = op0; tmp[1] = op1;
      744       488057 :       ix86_expand_vector_move_misalign (mode, tmp);
      745       488057 :       return;
      746              :     }
      747              : 
      748              :   /* Special case TImode to 128-bit vector conversions via V2DI.  */
      749      1133337 :   if (VECTOR_MODE_P (mode)
      750      4072315 :       && GET_MODE_SIZE (mode) == 16
      751      2866126 :       && SUBREG_P (op1)
      752       236940 :       && GET_MODE (SUBREG_REG (op1)) == TImode
      753         3180 :       && TARGET_64BIT && TARGET_SSE
      754      4125782 :       && ix86_pre_reload_split ())
      755              :     {
                       :       /* Move the TImode value as two DImode halves and concatenate
                       :          them into a V2DI register.  */
      756         2425 :       rtx tmp = gen_reg_rtx (V2DImode);
      757         2425 :       rtx lo = gen_reg_rtx (DImode);
      758         2425 :       rtx hi = gen_reg_rtx (DImode);
      759         2425 :       emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      760         2425 :       emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      761         2425 :       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      762         2425 :       emit_move_insn (op0, gen_lowpart (mode, tmp));
      763         2425 :       return;
      764              :     }
      765              : 
      766              :   /* If operand0 is a hard register, make operand1 a pseudo.  */
      767      4120830 :   if (can_create_pseudo_p ()
      768      8047190 :       && !ix86_hardreg_mov_ok (op0, op1))
      769              :     {
      770          125 :       rtx tmp = gen_reg_rtx (GET_MODE (op0));
      771          125 :       emit_move_insn (tmp, op1);
      772          125 :       emit_move_insn (op0, tmp);
      773          125 :       return;
      774              :     }
      775              : 
      776              :   /* Make operand1 a register if it isn't already.  */
      777      4120705 :   if (can_create_pseudo_p ()
      778      3926235 :       && !register_operand (op0, mode)
      779      5220383 :       && !register_operand (op1, mode))
      780              :     {
      781       212831 :       rtx tmp = gen_reg_rtx (GET_MODE (op0));
      782       212831 :       emit_move_insn (tmp, op1);
      783       212831 :       emit_move_insn (op0, tmp);
      784       212831 :       return;
      785              :     }
      786              : 
      787      3907874 :   emit_insn (gen_rtx_SET (op0, op1));
      788              : }
     789              : 
      790              : /* Split 32-byte AVX unaligned load and store if needed.  */
                       : /* OP0 is the destination, OP1 the source; exactly one is a MEM.  When
                       :    the tuning flags do not request splitting, a single unaligned
                       :    32-byte move is emitted instead.  */
      791              : 
      792              : static void
      793        13457 : ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
      794              : {
      795        13457 :   rtx m;
      796        13457 :   rtx (*extract) (rtx, rtx, rtx);
      797        13457 :   machine_mode mode;
      798              : 
                       :   /* Fast path: the target prefers full 32-byte unaligned accesses.  */
      799        13457 :   if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      800         4752 :       || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
      801              :     {
      802        13405 :       emit_insn (gen_rtx_SET (op0, op1));
      803        13405 :       return;
      804              :     }
      805              : 
      806           52 :   rtx orig_op0 = NULL_RTX;
      807           52 :   mode = GET_MODE (op0);
      808           52 :   switch (GET_MODE_CLASS (mode))
      809              :     {
      810           35 :     case MODE_VECTOR_INT:
      811           35 :     case MODE_INT:
                       :       /* Canonicalize all integer modes to V32QI so one extract
                       :          pattern handles them.  */
      812           35 :       if (mode != V32QImode)
      813              :         {
      814            7 :           if (!MEM_P (op0))
      815              :             {
      816            3 :               orig_op0 = op0;
      817            3 :               op0 = gen_reg_rtx (V32QImode);
      818              :             }
      819              :           else
      820            4 :             op0 = gen_lowpart (V32QImode, op0);
      821            7 :           op1 = gen_lowpart (V32QImode, op1);
      822            7 :           mode = V32QImode;
      823              :         }
      824              :       break;
      825              :     case MODE_VECTOR_FLOAT:
      826              :       break;
      827            0 :     default:
      828            0 :       gcc_unreachable ();
      829              :     }
      830              : 
                       :   /* Pick the 128-bit half-extract pattern and the half mode.  */
      831           52 :   switch (mode)
      832              :     {
      833            0 :     default:
      834            0 :       gcc_unreachable ();
      835              :     case E_V32QImode:
      836              :       extract = gen_avx_vextractf128v32qi;
      837              :       mode = V16QImode;
      838              :       break;
      839            1 :     case E_V16BFmode:
      840            1 :       extract = gen_avx_vextractf128v16bf;
      841            1 :       mode = V8BFmode;
      842            1 :       break;
      843            0 :     case E_V16HFmode:
      844            0 :       extract = gen_avx_vextractf128v16hf;
      845            0 :       mode = V8HFmode;
      846            0 :       break;
      847            8 :     case E_V8SFmode:
      848            8 :       extract = gen_avx_vextractf128v8sf;
      849            8 :       mode = V4SFmode;
      850            8 :       break;
      851            8 :     case E_V4DFmode:
      852            8 :       extract = gen_avx_vextractf128v4df;
      853            8 :       mode = V2DFmode;
      854            8 :       break;
      855              :     }
      856              : 
      857           52 :   if (MEM_P (op1))
      858              :     {
                       :       /* Unaligned load: two 16-byte loads, then vec_concat.  */
      859            9 :       rtx r = gen_reg_rtx (mode);
      860            9 :       m = adjust_address (op1, mode, 0);
      861            9 :       emit_move_insn (r, m);
      862            9 :       m = adjust_address (op1, mode, 16);
      863            9 :       r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      864            9 :       emit_move_insn (op0, r);
      865              :     }
      866           43 :   else if (MEM_P (op0))
      867              :     {
                       :       /* Unaligned store: extract and store each 128-bit half.  */
      868           43 :       m = adjust_address (op0, mode, 0);
      869           43 :       emit_insn (extract (m, op1, const0_rtx));
      870           43 :       m = adjust_address (op0, mode, 16);
      871           43 :       emit_insn (extract (m, copy_rtx (op1), const1_rtx));
      872              :     }
      873              :   else
      874            0 :     gcc_unreachable ();
      875              : 
                       :   /* Copy back if a V32QI scratch was substituted for a non-MEM op0.  */
      876           52 :   if (orig_op0)
      877            3 :     emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
      878              : }
     879              : 
     880              : /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
     881              :    straight to ix86_expand_vector_move.  */
     882              : /* Code generation for scalar reg-reg moves of single and double precision data:
     883              :      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
     884              :        movaps reg, reg
     885              :      else
     886              :        movss reg, reg
     887              :      if (x86_sse_partial_reg_dependency == true)
     888              :        movapd reg, reg
     889              :      else
     890              :        movsd reg, reg
     891              : 
     892              :    Code generation for scalar loads of double precision data:
     893              :      if (x86_sse_split_regs == true)
     894              :        movlpd mem, reg      (gas syntax)
     895              :      else
     896              :        movsd mem, reg
     897              : 
     898              :    Code generation for unaligned packed loads of single precision data
     899              :    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     900              :      if (x86_sse_unaligned_move_optimal)
     901              :        movups mem, reg
     902              : 
     903              :      if (x86_sse_partial_reg_dependency == true)
     904              :        {
     905              :          xorps  reg, reg
     906              :          movlps mem, reg
     907              :          movhps mem+8, reg
     908              :        }
     909              :      else
     910              :        {
     911              :          movlps mem, reg
     912              :          movhps mem+8, reg
     913              :        }
     914              : 
     915              :    Code generation for unaligned packed loads of double precision data
     916              :    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     917              :      if (x86_sse_unaligned_move_optimal)
     918              :        movupd mem, reg
     919              : 
     920              :      if (x86_sse_split_regs == true)
     921              :        {
     922              :          movlpd mem, reg
     923              :          movhpd mem+8, reg
     924              :        }
     925              :      else
     926              :        {
     927              :          movsd  mem, reg
     928              :          movhpd mem+8, reg
     929              :        }
     930              :  */
     931              : 
      932              : void
      933       812728 : ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
      934              : {
      935       812728 :   rtx op0, op1, m;
      936              : 
      937       812728 :   op0 = operands[0];
      938       812728 :   op1 = operands[1];
      939              : 
      940              :   /* Use unaligned load/store for AVX512 or when optimizing for size.  */
      941      1625456 :   if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
      942              :     {
      943        24002 :       emit_insn (gen_rtx_SET (op0, op1));
      944        24002 :       return;
      945              :     }
      946              : 
      947       788726 :   if (TARGET_AVX)
      948              :     {
                       :       /* 32-byte AVX moves may need to be split per tuning flags.  */
      949        62696 :       if (GET_MODE_SIZE (mode) == 32)
      950        13457 :         ix86_avx256_split_vector_move_misalign (op0, op1);
      951              :       else
      952              :         /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
      953        17891 :         emit_insn (gen_rtx_SET (op0, op1));
      954        31348 :       return;
      955              :     }
      956              : 
      957       757378 :   if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      958           95 :       || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
      959              :     {
      960       757283 :       emit_insn (gen_rtx_SET (op0, op1));
      961       757283 :       return;
      962              :     }
      963              : 
      964              :   /* ??? If we have typed data, then it would appear that using
      965              :      movdqu is the only way to get unaligned data loaded with
      966              :      integer type.  */
      967           95 :   if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
      968              :     {
      969           81 :       emit_insn (gen_rtx_SET (op0, op1));
      970           81 :       return;
      971              :     }
      972              : 
                       :   /* Legacy SSE: synthesize the unaligned access from two 64-bit
                       :      half loads/stores (see the comment block above for rationale).  */
      973           14 :   if (MEM_P (op1))
      974              :     {
      975            6 :       if (TARGET_SSE2 && mode == V2DFmode)
      976              :         {
      977            2 :           rtx zero;
      978              : 
      979              :           /* When SSE registers are split into halves, we can avoid
      980              :              writing to the top half twice.  */
      981            2 :           if (TARGET_SSE_SPLIT_REGS)
      982              :             {
      983            2 :               emit_clobber (op0);
      984            2 :               zero = op0;
      985              :             }
      986              :           else
      987              :             {
      988              :               /* ??? Not sure about the best option for the Intel chips.
      989              :                  The following would seem to satisfy; the register is
      990              :                  entirely cleared, breaking the dependency chain.  We
      991              :                  then store to the upper half, with a dependency depth
      992              :                  of one.  A rumor has it that Intel recommends two movsd
      993              :                  followed by an unpacklpd, but this is unconfirmed.  And
      994              :                  given that the dependency depth of the unpacklpd would
      995              :                  still be one, I'm not sure why this would be better.  */
      996            0 :               zero = CONST0_RTX (V2DFmode);
      997              :             }
      998              : 
      999            2 :           m = adjust_address (op1, DFmode, 0);
     1000            2 :           emit_insn (gen_sse2_loadlpd (op0, zero, m));
     1001            2 :           m = adjust_address (op1, DFmode, 8);
     1002            2 :           emit_insn (gen_sse2_loadhpd (op0, op0, m));
     1003            2 :         }
     1004              :       else
     1005              :         {
     1006            4 :           rtx t;
     1007              : 
     1008            4 :           if (mode != V4SFmode)
     1009            0 :             t = gen_reg_rtx (V4SFmode);
     1010              :           else
     1011              :             t = op0;
     1012              : 
     1013            4 :           if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
     1014            2 :             emit_move_insn (t, CONST0_RTX (V4SFmode));
     1015              :           else
     1016            2 :             emit_clobber (t);
     1017              : 
     1018            4 :           m = adjust_address (op1, V2SFmode, 0);
     1019            4 :           emit_insn (gen_sse_loadlps (t, t, m));
     1020            4 :           m = adjust_address (op1, V2SFmode, 8);
     1021            4 :           emit_insn (gen_sse_loadhps (t, t, m));
     1022            4 :           if (mode != V4SFmode)
     1023            0 :             emit_move_insn (op0, gen_lowpart (mode, t));
     1024              :         }
     1025              :     }
     1026            8 :   else if (MEM_P (op0))
     1027              :     {
                       :       /* Unaligned store: write the two 64-bit halves separately.  */
     1028            8 :       if (TARGET_SSE2 && mode == V2DFmode)
     1029              :         {
     1030            2 :           m = adjust_address (op0, DFmode, 0);
     1031            2 :           emit_insn (gen_sse2_storelpd (m, op1));
     1032            2 :           m = adjust_address (op0, DFmode, 8);
     1033            2 :           emit_insn (gen_sse2_storehpd (m, op1));
     1034              :         }
     1035              :       else
     1036              :         {
     1037            6 :           if (mode != V4SFmode)
     1038            0 :             op1 = gen_lowpart (V4SFmode, op1);
     1039              : 
     1040            6 :           m = adjust_address (op0, V2SFmode, 0);
     1041            6 :           emit_insn (gen_sse_storelps (m, op1));
     1042            6 :           m = adjust_address (op0, V2SFmode, 8);
     1043            6 :           emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
     1044              :         }
     1045              :     }
     1046              :   else
     1047            0 :     gcc_unreachable ();
     1048              : }
    1049              : 
     1050              : /* Move bits 64:95 to bits 32:63.  */
                       : /* OP is viewed as V4SI; element 1 is replaced by element 2 (elements
                       :    0, 2 and 3 are written from element 0 — only the low half of the
                       :    result is subsequently meaningful).  */
     1051              : 
     1052              : void
     1053          868 : ix86_move_vector_high_sse_to_mmx (rtx op)
     1054              : {
                       :   /* Shuffle selector { 0, 2, 0, 0 }.  */
     1055          868 :   rtx mask = gen_rtx_PARALLEL (VOIDmode,
     1056              :                                gen_rtvec (4, GEN_INT (0), GEN_INT (2),
     1057              :                                           GEN_INT (0), GEN_INT (0)));
     1058          868 :   rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
     1059          868 :   op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
     1060          868 :   rtx insn = gen_rtx_SET (dest, op);
     1061          868 :   emit_insn (insn);
     1062          868 : }
    1063              : 
     1064              : /* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
                       : /* OPERANDS[0] receives the packed result of OPERANDS[1] and
                       :    OPERANDS[2]; CODE is SS_TRUNCATE or US_TRUNCATE.  The SSE pack
                       :    leaves the useful data spread over the full 128-bit register, so
                       :    the high half is moved down afterwards.  */
     1065              : 
     1066              : void
     1067          778 : ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
     1068              : {
     1069          778 :   rtx op0 = operands[0];
     1070          778 :   rtx op1 = operands[1];
     1071          778 :   rtx op2 = operands[2];
     1072          778 :   rtx src;
     1073              : 
     1074          778 :   machine_mode dmode = GET_MODE (op0);
     1075          778 :   machine_mode smode = GET_MODE (op1);
     1076          778 :   machine_mode inner_dmode = GET_MODE_INNER (dmode);
     1077          778 :   machine_mode inner_smode = GET_MODE_INNER (smode);
     1078              : 
     1079              :   /* Get the corresponding SSE mode for destination.  */
     1080          778 :   int nunits = 16 / GET_MODE_SIZE (inner_dmode);
     1081         1556 :   machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
     1082         1556 :                                             nunits).require ();
     1083          778 :   machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
     1084         1556 :                                                  nunits / 2).require ();
     1085              : 
     1086              :   /* Get the corresponding SSE mode for source.  */
     1087          778 :   nunits = 16 / GET_MODE_SIZE (inner_smode);
     1088         1556 :   machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
     1089         1556 :                                             nunits).require ();
     1090              : 
     1091              :   /* Generate SSE pack with signed/unsigned saturation.  */
     1092          778 :   rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
     1093          778 :   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
     1094          778 :   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
     1095              : 
     1096              :   /* packusdw/packuswb does unsigned saturation of a signed source
     1097              :      which is different from generic us_truncate RTX.  */
     1098          778 :   if (code == US_TRUNCATE)
     1099          676 :     src = gen_rtx_UNSPEC (sse_dmode,
     1100              :                           gen_rtvec (2, op1, op2),
     1101              :                           UNSPEC_US_TRUNCATE);
     1102              :   else
     1103              :     {
     1104          102 :       op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
     1105          102 :       op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
     1106          102 :       src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
     1107              :     }
     1108              : 
     1109          778 :   emit_move_insn (dest, src);
     1110              : 
                       :   /* Bring the packed high half down into the MMX-visible low half.  */
     1111          778 :   ix86_move_vector_high_sse_to_mmx (op0);
     1112          778 : }
    1113              : 
    1114              : /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  This is also used
    1115              :    for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
    1116              :    OPERANDS[0].  */
    1117              : 
    1118              : void
    1119         6063 : ix86_split_mmx_punpck (rtx operands[], bool high_p)
    1120              : {
    1121         6063 :   rtx op0 = operands[0];
    1122         6063 :   rtx op1 = operands[1];
    1123         6063 :   rtx op2 = operands[2];
    1124         6063 :   machine_mode mode = GET_MODE (op1);
    1125         6063 :   rtx mask;
    1126              :   /* The corresponding SSE mode.  */
    1127         6063 :   machine_mode sse_mode, double_sse_mode;
    1128              : 
    1129         6063 :   switch (mode)
    1130              :     {
    1131         1606 :     case E_V8QImode:
    1132         1606 :     case E_V4QImode:
    1133         1606 :     case E_V2QImode:
    1134         1606 :       sse_mode = V16QImode;
    1135         1606 :       double_sse_mode = V32QImode;
    1136         1606 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1137              :                                gen_rtvec (16,
    1138              :                                           GEN_INT (0), GEN_INT (16),
    1139              :                                           GEN_INT (1), GEN_INT (17),
    1140              :                                           GEN_INT (2), GEN_INT (18),
    1141              :                                           GEN_INT (3), GEN_INT (19),
    1142              :                                           GEN_INT (4), GEN_INT (20),
    1143              :                                           GEN_INT (5), GEN_INT (21),
    1144              :                                           GEN_INT (6), GEN_INT (22),
    1145              :                                           GEN_INT (7), GEN_INT (23)));
    1146         1606 :       break;
    1147              : 
    1148         3366 :     case E_V4HImode:
    1149         3366 :     case E_V2HImode:
    1150         3366 :       sse_mode = V8HImode;
    1151         3366 :       double_sse_mode = V16HImode;
    1152         3366 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1153              :                                gen_rtvec (8,
    1154              :                                           GEN_INT (0), GEN_INT (8),
    1155              :                                           GEN_INT (1), GEN_INT (9),
    1156              :                                           GEN_INT (2), GEN_INT (10),
    1157              :                                           GEN_INT (3), GEN_INT (11)));
    1158         3366 :       break;
    1159              : 
    1160          740 :     case E_V2SImode:
    1161          740 :       sse_mode = V4SImode;
    1162          740 :       double_sse_mode = V8SImode;
    1163          740 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1164              :                                gen_rtvec (4,
    1165              :                                           GEN_INT (0), GEN_INT (4),
    1166              :                                           GEN_INT (1), GEN_INT (5)));
    1167          740 :       break;
    1168              : 
    1169          351 :     case E_V2SFmode:
    1170          351 :       sse_mode = V4SFmode;
    1171          351 :       double_sse_mode = V8SFmode;
    1172          351 :       mask = gen_rtx_PARALLEL (VOIDmode,
    1173              :                                gen_rtvec (4,
    1174              :                                           GEN_INT (0), GEN_INT (4),
    1175              :                                           GEN_INT (1), GEN_INT (5)));
    1176          351 :       break;
    1177              : 
    1178            0 :     default:
    1179            0 :       gcc_unreachable ();
    1180              :     }
    1181              : 
    1182              :   /* Generate SSE punpcklXX.  */
    1183         6063 :   rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
    1184         6063 :   op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
    1185         6063 :   op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
    1186              : 
    1187         6063 :   op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
    1188         6063 :   op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
    1189         6063 :   rtx insn = gen_rtx_SET (dest, op2);
    1190         6063 :   emit_insn (insn);
    1191              : 
    1192              :   /* Move high bits to low bits.  */
    1193         6063 :   if (high_p)
    1194              :     {
    1195         2480 :       if (sse_mode == V4SFmode)
    1196              :         {
    1197          121 :           mask = gen_rtx_PARALLEL (VOIDmode,
    1198              :                                    gen_rtvec (4, GEN_INT (2), GEN_INT (3),
    1199              :                                               GEN_INT (4), GEN_INT (5)));
    1200          121 :           op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
    1201          121 :           op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
    1202              :         }
    1203              :       else
    1204              :         {
    1205         2359 :           int sz = GET_MODE_SIZE (mode);
    1206              : 
    1207         2359 :           if (sz == 4)
    1208          239 :             mask = gen_rtx_PARALLEL (VOIDmode,
    1209              :                                      gen_rtvec (4, GEN_INT (1), GEN_INT (0),
    1210              :                                                 GEN_INT (0), GEN_INT (1)));
    1211         2120 :           else if (sz == 8)
    1212         2120 :             mask = gen_rtx_PARALLEL (VOIDmode,
    1213              :                                      gen_rtvec (4, GEN_INT (2), GEN_INT (3),
    1214              :                                                 GEN_INT (0), GEN_INT (1)));
    1215              :           else
    1216            0 :             gcc_unreachable ();
    1217              : 
    1218         2359 :           dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
    1219         2359 :           op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
    1220              :         }
    1221              : 
    1222         2480 :       insn = gen_rtx_SET (dest, op1);
    1223         2480 :       emit_insn (insn);
    1224              :     }
    1225         6063 : }
    1226              : 
    1227              : /* Helper function of ix86_fixup_binary_operands to canonicalize
    1228              :    operand order.  Returns true if the operands should be swapped.  */
    1229              : 
    1230              : static bool
    1231    174495999 : ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
    1232              :                              rtx operands[])
    1233              : {
    1234    174495999 :   rtx dst = operands[0];
    1235    174495999 :   rtx src1 = operands[1];
    1236    174495999 :   rtx src2 = operands[2];
    1237              : 
    1238              :   /* If the operation is not commutative, we can't do anything.  */
    1239    174495999 :   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
    1240     26817930 :       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    1241              :     return false;
    1242              : 
    1243              :   /* Highest priority is that src1 should match dst.  */
    1244    147689515 :   if (rtx_equal_p (dst, src1))
    1245              :     return false;
    1246    107333356 :   if (rtx_equal_p (dst, src2))
    1247              :     return true;
    1248              : 
    1249              :   /* Next highest priority is that immediate constants come second.  */
    1250    107248612 :   if (immediate_operand (src2, mode))
    1251              :     return false;
    1252     25900369 :   if (immediate_operand (src1, mode))
    1253              :     return true;
    1254              : 
    1255              :   /* Lowest priority is that memory references should come second.  */
    1256     25900369 :   if (MEM_P (src2))
    1257              :     return false;
    1258     24480677 :   if (MEM_P (src1))
    1259              :     return true;
    1260              : 
    1261              :   return false;
    1262              : }
    1263              : 
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required except
   under TARGET_APX_NDD.  OPERANDS[1] and OPERANDS[2] are updated in
   place; OPERANDS[0] is never modified here.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[], bool use_ndd)
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	/* Keep the dst-matching memory operand; load the other one.  */
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  The caller copies the result
     back to operands[0] when the returned dst differs.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory (NDD forms allow a
     memory src1 that does not match the destination).  */
  if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine: keep a memory src2 of an integer PLUS in
     a register — presumably so later combine can fold the addition
     into an address; NOTE(review): confirm against combine pass.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
    1324              : 
    1325              : /* Similarly, but assume that the destination has already been
    1326              :    set up properly.  */
    1327              : 
    1328              : void
    1329       288852 : ix86_fixup_binary_operands_no_copy (enum rtx_code code,
    1330              :                                     machine_mode mode, rtx operands[],
    1331              :                                     bool use_ndd)
    1332              : {
    1333       288852 :   rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
    1334       288852 :   gcc_assert (dst == operands[0]);
    1335       288852 : }
    1336              : 
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, then just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
			     rtx operands[], bool use_ndd)
{
  rtx src1, src2, dst, op, clob;

  /* Canonicalize the operands; DST may become a fresh pseudo when
     operands[0] is a non-matching memory.  */
  dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1)
      && !use_ndd)
    {
      /* This is going to be an LEA; avoid splitting it later.
	 LEA does not clobber the flags, so no CLOBBER is attached.  */
      emit_insn (op);
    }
  else
    {
      /* Ordinary ALU forms clobber the flags register.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
    1373              : 
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  /* OP1 is the source operand wrapped in a SUBREG (if any); OP2 is the
     other source operand.  */
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || CONST_VECTOR_P (op2))
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (CONST_VECTOR_P (op2)
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      /* Only lowpart SUBREGs of full-size float vectors reach here.  */
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  /* Perform the operation in the float vector mode and move the
	     result back through a lowpart.  */
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (CONST_VECTOR_P (op2))
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      /* Both operands are SUBREGs of same-mode float vectors;
		 strip the SUBREG from the second operand.  */
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  /* Generic path: legitimize both sources and emit the integer-mode
     vector logical operation.  */
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
    1451              : 
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  OPERANDS is not modified; the swap of SRC1
   and SRC2 below only affects the local checks.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
			 rtx operands[3], bool use_ndd)
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory (a broadcast memory
     operand counts as memory here).  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory, unless NDD forms (which
     take a separate destination) are in use.  */
  if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
	    && (mode == HImode
		|| mode == SImode
		|| (TARGET_64BIT && mode == DImode))
	    && satisfies_constraint_L (src2));

  return true;
}
    1491              : 
    1492              : /* Attempt to expand a unary operator.  Make the expansion closer to the
    1493              :    actual machine, then just general_operand, which will allow 2 separate
    1494              :    memory references (one output, one input) in a single insn.  */
    1495              : 
    1496              : void
    1497       118215 : ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
    1498              :                             rtx operands[], bool use_ndd)
    1499              : {
    1500       118215 :   bool matching_memory = false;
    1501       118215 :   rtx src, dst, op, clob;
    1502              : 
    1503       118215 :   dst = operands[0];
    1504       118215 :   src = operands[1];
    1505              : 
    1506              :   /* If the destination is memory, and we do not have matching source
    1507              :      operands, do things in registers.  */
    1508       118215 :   if (MEM_P (dst))
    1509              :     {
    1510         3205 :       if (rtx_equal_p (dst, src))
    1511              :         matching_memory = true;
    1512              :       else
    1513         2890 :         dst = gen_reg_rtx (mode);
    1514              :     }
    1515              : 
    1516              :   /* When source operand is memory, destination must match.  */
    1517       118215 :   if (!use_ndd && MEM_P (src) && !matching_memory)
    1518         4680 :     src = force_reg (mode, src);
    1519              : 
    1520              :   /* Emit the instruction.  */
    1521              : 
    1522       118215 :   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
    1523              : 
    1524       118215 :   if (code == NOT)
    1525        67749 :     emit_insn (op);
    1526              :   else
    1527              :     {
    1528        50466 :       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1529        50466 :       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1530              :     }
    1531              : 
    1532              :   /* Fix up the destination if needed.  */
    1533       118215 :   if (dst != operands[0])
    1534         2890 :     emit_move_insn (operands[0], dst);
    1535       118215 : }
    1536              : 
    1537              : /* Return TRUE or FALSE depending on whether the unary operator meets the
    1538              :    appropriate constraints.  */
    1539              : 
    1540              : bool
    1541      1710713 : ix86_unary_operator_ok (enum rtx_code,
    1542              :                         machine_mode,
    1543              :                         rtx operands[2],
    1544              :                         bool use_ndd)
    1545              : {
    1546              :   /* If one of operands is memory, source and destination must match.  */
    1547      1710713 :   if ((MEM_P (operands[0])
    1548      1667959 :        || (!use_ndd && MEM_P (operands[1])))
    1549      1739573 :       && ! rtx_equal_p (operands[0], operands[1]))
    1550              :     return false;
    1551              :   return true;
    1552              : }
    1553              : 
    1554              : /* Predict just emitted jump instruction to be taken with probability PROB.  */
    1555              : 
    1556              : static void
    1557        66018 : predict_jump (int prob)
    1558              : {
    1559        66018 :   rtx_insn *insn = get_last_insn ();
    1560        66018 :   gcc_assert (JUMP_P (insn));
    1561        66018 :   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
    1562        66018 : }
    1563              : 
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  OPERANDS[0] receives the
   quotient, OPERANDS[1] the remainder; OPERANDS[2] is the dividend and
   OPERANDS[3] the divisor.  UNSIGNED_P selects unsigned division.  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  /* Pick the full-width divmod expander; the _zext variants also
     zero-extend an SImode result into a DImode destination.  */
  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255]: scratch = op2 | op3 has bits above 7 clear
     exactly when both operands fit in 8 bits.  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
			    operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  /* Build the REG_EQUAL expressions describing the full-width
     quotient and remainder for the notes below.  */
  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
	div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
			       GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
    1675              : 
    1676              : /* Emit x86 binary operand CODE in mode MODE, where the first operand
    1677              :    matches destination.  RTX includes clobber of FLAGS_REG.  */
    1678              : 
    1679              : void
    1680         7890 : ix86_emit_binop (enum rtx_code code, machine_mode mode,
    1681              :                  rtx dst, rtx src)
    1682              : {
    1683         7890 :   rtx op, clob;
    1684              : 
    1685         7890 :   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
    1686         7890 :   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    1687              : 
    1688         7890 :   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    1689         7890 : }
    1690              : 
    1691              : /* Return true if regno1 def is nearest to the insn.  */
    1692              : 
    1693              : static bool
    1694           15 : find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
    1695              : {
    1696           15 :   rtx_insn *prev = insn;
    1697           15 :   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
    1698              : 
    1699           15 :   if (insn == start)
    1700              :     return false;
    1701           40 :   while (prev && prev != start)
    1702              :     {
    1703           30 :       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
    1704              :         {
    1705           10 :           prev = PREV_INSN (prev);
    1706           10 :           continue;
    1707              :         }
    1708           20 :       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
    1709              :         return true;
    1710           15 :       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
    1711              :         return false;
    1712           15 :       prev = PREV_INSN (prev);
    1713              :     }
    1714              : 
    1715              :   /* None of the regs is defined in the bb.  */
    1716              :   return false;
    1717              : }
    1718              : 
/* INSN_UID of the last insn emitted by zero store peephole2s; used to
   detect consecutive zero stores.  */
int ix86_last_zero_store_uid;
    1721              : 
    1722              : /* Split lea instructions into a sequence of instructions
    1723              :    which are executed on ALU to avoid AGU stalls.
    1724              :    It is assumed that it is allowed to clobber flags register
    1725              :    at lea position.  */
    1726              : 
    1727              : void
    1728         6071 : ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
    1729              : {
    1730         6071 :   unsigned int regno0, regno1, regno2;
    1731         6071 :   struct ix86_address parts;
    1732         6071 :   rtx target, tmp;
    1733         6071 :   int ok, adds;
    1734              : 
    1735         6071 :   ok = ix86_decompose_address (operands[1], &parts);
    1736         6071 :   gcc_assert (ok);
    1737              : 
    1738         6071 :   target = gen_lowpart (mode, operands[0]);
    1739              : 
    1740         6071 :   regno0 = true_regnum (target);
    1741         6071 :   regno1 = INVALID_REGNUM;
    1742         6071 :   regno2 = INVALID_REGNUM;
    1743              : 
    1744         6071 :   if (parts.base)
    1745              :     {
    1746         6063 :       parts.base = gen_lowpart (mode, parts.base);
    1747         6063 :       regno1 = true_regnum (parts.base);
    1748              :     }
    1749              : 
    1750         6071 :   if (parts.index)
    1751              :     {
    1752         6068 :       parts.index = gen_lowpart (mode, parts.index);
    1753         6068 :       regno2 = true_regnum (parts.index);
    1754              :     }
    1755              : 
    1756         6071 :   if (parts.disp)
    1757          167 :     parts.disp = gen_lowpart (mode, parts.disp);
    1758              : 
    1759         6071 :   if (parts.scale > 1)
    1760              :     {
    1761              :       /* Case r1 = r1 + ...  */
    1762           11 :       if (regno1 == regno0)
    1763              :         {
    1764              :           /* If we have a case r1 = r1 + C * r2 then we
    1765              :              should use multiplication which is very
    1766              :              expensive.  Assume cost model is wrong if we
    1767              :              have such case here.  */
    1768            0 :           gcc_assert (regno2 != regno0);
    1769              : 
    1770            0 :           for (adds = parts.scale; adds > 0; adds--)
    1771            0 :             ix86_emit_binop (PLUS, mode, target, parts.index);
    1772              :         }
    1773              :       else
    1774              :         {
    1775              :           /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
    1776           11 :           if (regno0 != regno2)
    1777            8 :             emit_insn (gen_rtx_SET (target, parts.index));
    1778              : 
    1779              :           /* Use shift for scaling, but emit it as MULT instead
    1780              :              to avoid it being immediately peephole2 optimized back
    1781              :              into lea.  */
    1782           11 :           ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
    1783              : 
    1784           11 :           if (parts.base)
    1785            3 :             ix86_emit_binop (PLUS, mode, target, parts.base);
    1786              : 
    1787           11 :           if (parts.disp && parts.disp != const0_rtx)
    1788            3 :             ix86_emit_binop (PLUS, mode, target, parts.disp);
    1789              :         }
    1790              :     }
    1791         6060 :   else if (!parts.base && !parts.index)
    1792              :     {
    1793            0 :       gcc_assert(parts.disp);
    1794            0 :       emit_insn (gen_rtx_SET (target, parts.disp));
    1795              :     }
    1796              :   else
    1797              :     {
    1798         6060 :       if (!parts.base)
    1799              :         {
    1800            0 :           if (regno0 != regno2)
    1801            0 :             emit_insn (gen_rtx_SET (target, parts.index));
    1802              :         }
    1803         6060 :       else if (!parts.index)
    1804              :         {
    1805            3 :           if (regno0 != regno1)
    1806            1 :             emit_insn (gen_rtx_SET (target, parts.base));
    1807              :         }
    1808              :       else
    1809              :         {
    1810         6057 :           if (regno0 == regno1)
    1811              :             tmp = parts.index;
    1812         3257 :           else if (regno0 == regno2)
    1813              :             tmp = parts.base;
    1814              :           else
    1815              :             {
    1816           15 :               rtx tmp1;
    1817              : 
    1818              :               /* Find better operand for SET instruction, depending
    1819              :                  on which definition is farther from the insn.  */
    1820           15 :               if (find_nearest_reg_def (insn, regno1, regno2))
    1821            5 :                 tmp = parts.index, tmp1 = parts.base;
    1822              :               else
    1823           10 :                 tmp = parts.base, tmp1 = parts.index;
    1824              : 
    1825           15 :               emit_insn (gen_rtx_SET (target, tmp));
    1826              : 
    1827           15 :               if (parts.disp && parts.disp != const0_rtx)
    1828            0 :                 ix86_emit_binop (PLUS, mode, target, parts.disp);
    1829              : 
    1830           15 :               ix86_emit_binop (PLUS, mode, target, tmp1);
    1831           15 :               return;
    1832              :             }
    1833              : 
    1834         6042 :           ix86_emit_binop (PLUS, mode, target, tmp);
    1835              :         }
    1836              : 
    1837         6045 :       if (parts.disp && parts.disp != const0_rtx)
    1838            4 :         ix86_emit_binop (PLUS, mode, target, parts.disp);
    1839              :     }
    1840              : }
    1841              : 
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.

   The trick: values >= 2^31 cannot be converted with the signed
   cvtt* instructions, so subtract 2^31 first where needed, convert,
   then flip the sign bit of the result back in with an XOR.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  /* operands[1] and operands[2] are scratch vector registers;
     operands[3] is the FP input; operands[4] holds the constant 2^31.  */
  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  /* View the integer destination register in vector mode.  */
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss_v4sf (value, value, input));
      else
	emit_insn (gen_sse2_movsd_v2df (value, value, input));
    }

  emit_move_insn (large, two31);
  /* If 2^31 came from memory it is now in LARGE; copy from there.  */
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  /* large = (2^31 <= value) as an all-ones/all-zeros compare mask.  */
  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  /* zero_or_two31 = mask ? 2^31 : 0.0.  */
  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  /* value -= (value >= 2^31 ? 2^31 : 0), bringing it into signed range.  */
  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  /* Turn the compare mask into 0x80000000 per lane.  */
  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  /* Signed truncating FP->int conversion of the adjusted value.  */
  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  /* Add 2^31 back where it was subtracted, by flipping the sign bit.  */
  emit_insn (gen_xorv4si3 (value, value, large));
}

    1901              : 
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  /* Get the 64-bit input into the low half of an xmm register, choosing
     the cheapest path for the target's GPR->XMM move characteristics.  */
  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      /* Clobber first so only the DImode lowpart is live afterwards.  */
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  /* 0x43300000 / 0x45300000 are the high words of 0x1.0p52 / 0x1.0p84.  */
  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      /* No horizontal add: interleave the high element down and add.  */
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  /* The sum in element 0 is the converted result.  */
  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}

    1969              : 
/* Not used, but eases macroization of patterns.  Must never be called;
   XFmode has no SSE conversion path.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}

    1976              : 
    1977              : static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
    1978              : 
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.

   Works by biasing the input into signed range (subtract 2^31 via a
   wrapping unsigned add of -2^31), doing a signed int->double
   conversion, and adding 2^31 back in DFmode (exactly representable).  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  /* x = input - 2^31 interpreted as signed (add of INT_MIN wraps).  */
  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  /* Undo the bias: result = fp + 2^31.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}

    2006              : 
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.

   result = (double) hi32 * 2^32 + (double) (unsigned) lo32.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  /* High word is signed; convert it directly.  */
  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  /* Scale the high part by 2^32.  */
  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  /* Low word is unsigned; use the unsigned SI->DF helper.  */
  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}

    2032              : 
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.

   Split the 32-bit input into two 16-bit halves (each exactly
   representable in SFmode), convert both, and recombine as
   hi * 2^16 + lo.  */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  /* int_lo = input & 0xffff; int_hi = input >> 16.  */
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
				      NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
				      NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      /* Single fused multiply-add: target = fp_hi * 2^16 + fp_lo.  */
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      /* Separate multiply and add; both are exact for these operands.  */
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
				   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
				   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
	emit_move_insn (target, fp_hi);
    }
}

    2067              : 
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.

   Vector analogue of ix86_expand_convert_uns_sisf_sse: split each lane
   into 16-bit halves, convert both as signed, recombine as
   hi * 2^16 + lo.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  /* Pick the signed int->float conversion pattern for the vector width.  */
  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  /* tmp[1] = val & 0xffff (per lane); tmp[2] = val >> 16 (per lane).  */
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  /* Broadcast the constant 2^16 into a float vector.  */
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      /* target = hi * 2^16 + lo in one fused operation.  */
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
				    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
				    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
	emit_move_insn (target, tmp[7]);
    }
}

    2112              : 
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.

   Returns the adjusted FP vector; *XORP receives the integer vector the
   caller must XOR into the converted result.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  /* 32-byte FP vectors pair with V8SImode integer results.  */
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  /* Broadcast the constant 2^31.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  /* tmp[0] = per-lane mask of (2^31 <= val).  */
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  /* tmp[1] = mask ? 2^31 : 0.0 -- the amount to subtract per lane.  */
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    /* *xorp = mask << 31, i.e. 0x80000000 in the lanes that were
       adjusted.  */
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      /* No 256-bit integer shift without AVX2: AND with a 0x80000000
	 constant vector instead.  */
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  /* Return val - (mask ? 2^31 : 0), now within signed range.  */
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}

    2164              : 
/* Generate code for floating point ABS or NEG.

   CODE is ABS or NEG, MODE the (scalar or vector) FP mode, OPERANDS
   the destination and source.  Emits a single PARALLEL insn; the SSE
   form carries a USE of the sign-bit mask constant, the integer form
   clobbers the flags register.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  /* Decide whether the operation is done on SSE registers, and which
     vector mode holds the scalar (for building the mask constant).  */
  switch (mode)
  {
  case E_HFmode:
    use_sse = true;
    vmode = V8HFmode;
    break;
  case E_BFmode:
    use_sse = true;
    vmode = V8BFmode;
    break;
  case E_SFmode:
    use_sse = TARGET_SSE_MATH && TARGET_SSE;
    vmode = V4SFmode;
    break;
  case E_DFmode:
    use_sse = TARGET_SSE_MATH && TARGET_SSE2;
    vmode = V2DFmode;
    break;
  default:
    /* Vector modes and TFmode always go through SSE.  */
    use_sse = vector_mode || mode == TFmode;
    break;
  }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
	par = gen_rtvec (2, set, use);
      else
	{
	  /* Scalar SSE form may be split to integer ops later, so also
	     clobber the flags.  */
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}

    2233              : 
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.

   ABS becomes AND with the sign bit cleared; NEG becomes XOR (or a
   ZERO_EXTRACT flip) of the sign bit.  OPERANDS[0] and OPERANDS[1]
   must be the same register.  Emits one insn with a flags clobber.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      /* Sign bit is bit 31 of the single SImode word.  */
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  /* Operate on bit 63 directly via a one-bit ZERO_EXTRACT.  */
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  /* 32-bit: the sign bit lives in the high SImode word.  */
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      /* Select the SImode word holding the XFmode sign/exponent field
	 (word offset differs between 64- and 32-bit register layout).  */
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}

    2320              : 
    2321              : /* Expand a copysign operation.  Special case operand 0 being a constant.  */
    2322              : 
void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  /* Pick the vector mode in which the bitwise sign manipulation is
     performed; TFmode is wide enough to be used directly.  */
  switch (mode)
  {
  case E_HFmode:
    vmode = V8HFmode;
    break;
  case E_BFmode:
    vmode = V8BFmode;
    break;
  case E_SFmode:
    vmode = V4SFmode;
    break;
  case E_DFmode:
    vmode = V2DFmode;
    break;
  case E_TFmode:
    vmode = mode;
    break;
  default:
    gcc_unreachable();
  }

  /* copysign (a, a) is simply a.  */
  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* Operate on a vector-mode view of the destination when one exists;
     otherwise compute into a fresh vector register and copy the scalar
     lowpart back at the end (dest is kept non-NULL only in that case).  */
  dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
  /* Mask with only the sign bit set in each element.  The second
     argument selects the mask's mode variant for AVX512F (not used
     for HFmode).  */
  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);

  /* When the sign source is a compile-time constant, only one bitwise
     operation is needed.  */
  if (CONST_DOUBLE_P (operands[2]))
    {
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
        /* Simplify b = copysign (a, negative) to b = mask | a.  */
        op1 = gen_rtx_IOR (vmode, mask, op1);
      else
        {
          /* Simplify b = copysign (a, positive) to b = invert_mask & a.  */
          rtx invert_mask
            = ix86_build_signbit_mask (vmode,
                                       TARGET_AVX512F && mode != HFmode,
                                       true);
          op1 = gen_rtx_AND (vmode, invert_mask, op1);
        }
      emit_move_insn (vdest, op1);
      if (dest)
        emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
      return;
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);

  /* General case: (a & ~mask) | (b & mask).  */
  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  rtx invert_mask;
  /* NB: Generate vmovdqa, vpandn, vpand, vpor for AVX and generate pand,
     pand, por for SSE.  */
  if (TARGET_AVX)
    invert_mask = gen_rtx_NOT (vmode, mask);
  else
    invert_mask = ix86_build_signbit_mask (vmode,
                                           TARGET_AVX512F && mode != HFmode,
                                           true);
  emit_move_insn (op2, gen_rtx_AND (vmode, invert_mask, op1));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op0));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
    2406              : 
    2407              : /* Expand an xorsign operation.  */
    2408              : 
    2409              : void
    2410           20 : ix86_expand_xorsign (rtx operands[])
    2411              : {
    2412           20 :   machine_mode mode, vmode;
    2413           20 :   rtx dest, vdest, op0, op1, mask, x, temp;
    2414              : 
    2415           20 :   dest = operands[0];
    2416           20 :   op0 = operands[1];
    2417           20 :   op1 = operands[2];
    2418              : 
    2419           20 :   mode = GET_MODE (dest);
    2420              : 
    2421           20 :   switch (mode)
    2422              :   {
    2423              :   case E_HFmode:
    2424              :     vmode = V8HFmode;
    2425              :     break;
    2426              :   case E_BFmode:
    2427              :     vmode = V8BFmode;
    2428              :     break;
    2429              :   case E_SFmode:
    2430              :     vmode = V4SFmode;
    2431              :     break;
    2432              :   case E_DFmode:
    2433              :     vmode = V2DFmode;
    2434              :     break;
    2435            0 :   default:
    2436            0 :     gcc_unreachable ();
    2437           20 :     break;
    2438              :   }
    2439              : 
    2440           20 :   temp = gen_reg_rtx (vmode);
    2441           20 :   mask = ix86_build_signbit_mask (vmode, 0, 0);
    2442              : 
    2443           20 :   op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
    2444           20 :   x = gen_rtx_AND (vmode, op1, mask);
    2445           20 :   emit_insn (gen_rtx_SET (temp, x));
    2446              : 
    2447           20 :   op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
    2448           20 :   x = gen_rtx_XOR (vmode, temp, op0);
    2449              : 
    2450           20 :   vdest = lowpart_subreg (vmode, dest, mode);
    2451           20 :   if (vdest == NULL_RTX)
    2452            0 :     vdest = gen_reg_rtx (vmode);
    2453              :   else
    2454              :     dest = NULL_RTX;
    2455           20 :   emit_insn (gen_rtx_SET (vdest, x));
    2456              : 
    2457           20 :   if (dest)
    2458            0 :     emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
    2459           20 : }
    2460              : 
    2461              : static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
    2462              : 
/* Emit a conditional jump to LABEL taken when OP0 CODE OP1 holds.
   Handles vector equality/inequality tests via ptest or
   vpcmpeq+kortest, single-compare scalar modes, and double-word
   comparisons (DImode on 32-bit, TImode) which are split into
   word-sized compare+branch sequences.  */

void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparsion with boolean result, transform
     it using ptest instruction or vpcmpeq + kortest.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      || (mode == TImode && !TARGET_64BIT)
      || mode == OImode
      || GET_MODE_SIZE (mode) == 64)
    {
      unsigned msize = GET_MODE_SIZE (mode);
      machine_mode p_mode
        = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);

      /* Only equality/inequality is supported on this path.  */
      gcc_assert (code == EQ || code == NE);

      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
      if (msize == 64)
        {
          if (mode != V16SImode)
            {
              op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
              op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
            }

          tmp = gen_reg_rtx (HImode);
          emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
          emit_insn (gen_kortesthi_ccc (tmp, tmp));
        }
      /* Using ptest for 128/256-bit vectors.  */
      else
        {
          if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
            {
              op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
              op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
              mode = p_mode;
            }

          /* Generate XOR since we can't check that one operand is zero
             vector.  */
          tmp = gen_reg_rtx (mode);
          rtx ops[3] = { tmp, op0, op1 };
          ix86_expand_vector_logical_operator (XOR, mode, ops);
          tmp = gen_lowpart (p_mode, tmp);
          emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
                                  gen_rtx_UNSPEC (CCZmode,
                                                  gen_rtvec (2, tmp, tmp),
                                                  UNSPEC_PTEST)));
        }
      /* Branch on the flags set above.  */
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_HFmode:
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      /* Modes handled by a single compare instruction: emit the
         compare and a conditional jump on its result.  */
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_BFmode:
      /* BFmode compares reach here only with native AVX10.2 support
         and no trapping-math requirement.  */
      gcc_assert (TARGET_AVX10_2 && !flag_trapping_math);
      goto simple;

    case E_DImode:
      if (TARGET_64BIT)
        goto simple;
      /* FALLTHRU */
    case E_TImode:
      /* DI and TI mode equality/inequality comparisons may be performed
         on SSE registers.  Avoid splitting them, except when optimizing
         for size.  */
      if ((code == EQ || code == NE)
          && !optimize_insn_for_size_p ())
        goto simple;

      /* Expand DImode branch into multiple compare+branch.  */
      {
        rtx lo[2], hi[2];
        rtx_code_label *label2;
        enum rtx_code code1, code2, code3;
        machine_mode submode;

        /* Canonicalize so a constant, if any, is the second operand.  */
        if (CONSTANT_P (op0) && !CONSTANT_P (op1))
          {
            std::swap (op0, op1);
            code = swap_condition (code);
          }

        split_double_mode (mode, &op0, 1, lo+0, hi+0);
        split_double_mode (mode, &op1, 1, lo+1, hi+1);

        submode = mode == DImode ? SImode : DImode;

        /* If we are doing less-than or greater-or-equal-than,
           op1 is a constant and the low word is zero, then we can just
           examine the high word.  Similarly for low word -1 and
           less-or-equal-than or greater-than.  */

        if (CONST_INT_P (hi[1]))
          switch (code)
            {
            case LT: case LTU: case GE: case GEU:
              if (lo[1] == const0_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            case LE: case LEU: case GT: case GTU:
              if (lo[1] == constm1_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            default:
              break;
            }

        /* Emulate comparisons that do not depend on Zero flag with
           double-word subtraction.  Note that only Overflow, Sign
           and Carry flags are valid, so swap arguments and condition
           of comparisons that would otherwise test Zero flag.  */

        switch (code)
          {
          case LE: case LEU: case GT: case GTU:
            std::swap (lo[0], lo[1]);
            std::swap (hi[0], hi[1]);
            code = swap_condition (code);
            /* FALLTHRU */

          case LT: case LTU: case GE: case GEU:
            {
              bool uns = (code == LTU || code == GEU);
              rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
                = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

              /* Force the operands into forms the cmp/sbb patterns
                 accept.  */
              if (!nonimmediate_operand (lo[0], submode))
                lo[0] = force_reg (submode, lo[0]);
              if (!x86_64_general_operand (lo[1], submode))
                lo[1] = force_reg (submode, lo[1]);

              if (!register_operand (hi[0], submode))
                hi[0] = force_reg (submode, hi[0]);
              if ((uns && !nonimmediate_operand (hi[1], submode))
                  || (!uns && !x86_64_general_operand (hi[1], submode)))
                hi[1] = force_reg (submode, hi[1]);

              /* Compare the low words, then subtract-with-borrow the
                 high words; branch on the resulting flags.  */
              emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

              tmp = gen_rtx_SCRATCH (submode);
              emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

              tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
              ix86_expand_branch (code, tmp, const0_rtx, label);
              return;
            }

          default:
            break;
          }

        /* Otherwise, we need two or three jumps.  */

        label2 = gen_label_rtx ();

        code1 = code;
        code2 = swap_condition (code);
        code3 = unsigned_condition (code);  /* low-word compare is unsigned */

        switch (code)
          {
          case LT: case GT: case LTU: case GTU:
            break;

          case LE:   code1 = LT;  code2 = GT;  break;
          case GE:   code1 = GT;  code2 = LT;  break;
          case LEU:  code1 = LTU; code2 = GTU; break;
          case GEU:  code1 = GTU; code2 = LTU; break;

          case EQ:   code1 = UNKNOWN; code2 = NE;  break;
          case NE:   code2 = UNKNOWN; break;

          default:
            gcc_unreachable ();
          }

        /*
         * a < b =>
         *    if (hi(a) < hi(b)) goto true;
         *    if (hi(a) > hi(b)) goto false;
         *    if (lo(a) < lo(b)) goto true;
         *  false:
         */

        if (code1 != UNKNOWN)
          ix86_expand_branch (code1, hi[0], hi[1], label);
        if (code2 != UNKNOWN)
          ix86_expand_branch (code2, hi[0], hi[1], label2);

        ix86_expand_branch (code3, lo[0], lo[1], label);

        if (code2 != UNKNOWN)
          emit_label (label2);
        return;
      }

    default:
      /* Anything else must already be a flags-register comparison.  */
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
    2697              : 
    2698              : /* Figure out whether to use unordered fp comparisons.  */
    2699              : 
    2700              : static bool
    2701      1148001 : ix86_unordered_fp_compare (enum rtx_code code)
    2702              : {
    2703      1148001 :   if (!TARGET_IEEE_FP)
    2704              :     return false;
    2705              : 
    2706      1143619 :   switch (code)
    2707              :     {
    2708              :     case LT:
    2709              :     case LE:
    2710              :     case GT:
    2711              :     case GE:
    2712              :     case LTGT:
    2713              :       return false;
    2714              : 
    2715              :     case EQ:
    2716              :     case NE:
    2717              : 
    2718              :     case UNORDERED:
    2719              :     case ORDERED:
    2720              :     case UNLT:
    2721              :     case UNLE:
    2722              :     case UNGT:
    2723              :     case UNGE:
    2724              :     case UNEQ:
    2725              :       return true;
    2726              : 
    2727            0 :     default:
    2728            0 :       gcc_unreachable ();
    2729              :     }
    2730              : }
    2731              : 
    2732              : /* Return a comparison we can do and that it is equivalent to
    2733              :    swap_condition (code) apart possibly from orderedness.
    2734              :    But, never change orderedness if TARGET_IEEE_FP, returning
    2735              :    UNKNOWN in that case if necessary.  */
    2736              : 
    2737              : static enum rtx_code
    2738        37454 : ix86_fp_swap_condition (enum rtx_code code)
    2739              : {
    2740        37454 :   switch (code)
    2741              :     {
    2742         1847 :     case GT:                   /* GTU - CF=0 & ZF=0 */
    2743         1847 :       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    2744          533 :     case GE:                   /* GEU - CF=0 */
    2745          533 :       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    2746          446 :     case UNLT:                 /* LTU - CF=1 */
    2747          446 :       return TARGET_IEEE_FP ? UNKNOWN : GT;
    2748         6315 :     case UNLE:                 /* LEU - CF=1 | ZF=1 */
    2749         6315 :       return TARGET_IEEE_FP ? UNKNOWN : GE;
    2750        28313 :     default:
    2751        28313 :       return swap_condition (code);
    2752              :     }
    2753              : }
    2754              : 
    2755              : /* Return cost of comparison CODE using the best strategy for performance.
    2756              :    All following functions do use number of instructions as a cost metrics.
    2757              :    In future this should be tweaked to compute bytes for optimize_size and
    2758              :    take into account performance of various instructions on various CPUs.  */
    2759              : 
    2760              : static int
    2761      1146866 : ix86_fp_comparison_cost (enum rtx_code code)
    2762              : {
    2763      1146866 :   int arith_cost;
    2764              : 
    2765              :   /* The cost of code using bit-twiddling on %ah.  */
    2766      1146866 :   switch (code)
    2767              :     {
    2768              :     case UNLE:
    2769              :     case UNLT:
    2770              :     case LTGT:
    2771              :     case GT:
    2772              :     case GE:
    2773              :     case UNORDERED:
    2774              :     case ORDERED:
    2775              :     case UNEQ:
    2776              :       arith_cost = 4;
    2777              :       break;
    2778        84147 :     case LT:
    2779        84147 :     case NE:
    2780        84147 :     case EQ:
    2781        84147 :     case UNGE:
    2782        84147 :       arith_cost = TARGET_IEEE_FP ? 5 : 4;
    2783              :       break;
    2784        25472 :     case LE:
    2785        25472 :     case UNGT:
    2786      1063530 :       arith_cost = TARGET_IEEE_FP ? 6 : 4;
    2787              :       break;
    2788            0 :     default:
    2789            0 :       gcc_unreachable ();
    2790              :     }
    2791              : 
    2792      1146866 :   switch (ix86_fp_comparison_strategy (code))
    2793              :     {
    2794      1146866 :     case IX86_FPCMP_COMI:
    2795      1146866 :       return arith_cost > 4 ? 3 : 2;
    2796            0 :     case IX86_FPCMP_SAHF:
    2797            0 :       return arith_cost > 4 ? 4 : 3;
    2798              :     default:
    2799              :       return arith_cost;
    2800              :     }
    2801              : }
    2802              : 
    2803              : /* Swap, force into registers, or otherwise massage the two operands
    2804              :    to a fp comparison.  The operands are updated in place; the new
    2805              :    comparison code is returned.  */
    2806              : 
    2807              : static enum rtx_code
    2808       573433 : ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
    2809              : {
    2810       573504 :   bool unordered_compare = ix86_unordered_fp_compare (code);
    2811       573504 :   rtx op0 = *pop0, op1 = *pop1;
    2812       573504 :   machine_mode op_mode = GET_MODE (op0);
    2813       573504 :   bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (op_mode);
    2814              : 
    2815       571113 :   if (op_mode == BFmode && (!TARGET_AVX10_2 || flag_trapping_math))
    2816              :     {
    2817           71 :       rtx op = gen_lowpart (HImode, op0);
    2818           71 :       if (CONST_INT_P (op))
    2819            0 :         op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
    2820              :                                              op0, BFmode);
    2821              :       else
    2822              :         {
    2823           71 :           rtx t1 = gen_reg_rtx (SImode);
    2824           71 :           emit_insn (gen_zero_extendhisi2 (t1, op));
    2825           71 :           emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
    2826           71 :           op = gen_lowpart (SFmode, t1);
    2827              :         }
    2828           71 :       *pop0 = op;
    2829           71 :       op = gen_lowpart (HImode, op1);
    2830           71 :       if (CONST_INT_P (op))
    2831            6 :         op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
    2832              :                                              op1, BFmode);
    2833              :       else
    2834              :         {
    2835           65 :           rtx t1 = gen_reg_rtx (SImode);
    2836           65 :           emit_insn (gen_zero_extendhisi2 (t1, op));
    2837           65 :           emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
    2838           65 :           op = gen_lowpart (SFmode, t1);
    2839              :         }
    2840           71 :       *pop1 = op;
    2841           71 :       return ix86_prepare_fp_compare_args (code, pop0, pop1);
    2842              :     }
    2843              : 
    2844              :   /* All of the unordered compare instructions only work on registers.
    2845              :      The same is true of the fcomi compare instructions.  The XFmode
    2846              :      compare instructions require registers except when comparing
    2847              :      against zero or when converting operand 1 from fixed point to
    2848              :      floating point.  */
    2849              : 
    2850       573433 :   if (!is_sse
    2851       573433 :       && (unordered_compare
    2852         8222 :           || (op_mode == XFmode
    2853        10523 :               && ! (standard_80387_constant_p (op0) == 1
    2854         5259 :                     || standard_80387_constant_p (op1) == 1)
    2855         4918 :               && GET_CODE (op1) != FLOAT)
    2856         3304 :           || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    2857              :     {
    2858       147672 :       op0 = force_reg (op_mode, op0);
    2859       147672 :       op1 = force_reg (op_mode, op1);
    2860              :     }
    2861              :   else
    2862              :     {
    2863              :       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
    2864              :          things around if they appear profitable, otherwise force op0
    2865              :          into a register.  */
    2866              : 
    2867       425761 :       if (standard_80387_constant_p (op0) == 0
    2868       425761 :           || (MEM_P (op0)
    2869        56544 :               && ! (standard_80387_constant_p (op1) == 0
    2870        41217 :                     || MEM_P (op1))))
    2871              :         {
    2872        37454 :           enum rtx_code new_code = ix86_fp_swap_condition (code);
    2873        37454 :           if (new_code != UNKNOWN)
    2874              :             {
    2875              :               std::swap (op0, op1);
    2876       425761 :               code = new_code;
    2877              :             }
    2878              :         }
    2879              : 
    2880       425761 :       if (!REG_P (op0))
    2881        52797 :         op0 = force_reg (op_mode, op0);
    2882              : 
    2883       425761 :       if (CONSTANT_P (op1))
    2884              :         {
    2885       193014 :           int tmp = standard_80387_constant_p (op1);
    2886       193014 :           if (tmp == 0)
    2887        73719 :             op1 = validize_mem (force_const_mem (op_mode, op1));
    2888       119295 :           else if (tmp == 1)
    2889              :             {
    2890        65207 :               if (TARGET_CMOVE)
    2891        65207 :                 op1 = force_reg (op_mode, op1);
    2892              :             }
    2893              :           else
    2894        54088 :             op1 = force_reg (op_mode, op1);
    2895              :         }
    2896              :     }
    2897              : 
    2898              :   /* Try to rearrange the comparison to make it cheaper.  */
    2899       573433 :   if (ix86_fp_comparison_cost (code)
    2900       573433 :       > ix86_fp_comparison_cost (swap_condition (code))
    2901       573433 :       && (REG_P (op1) || can_create_pseudo_p ()))
    2902              :     {
    2903            0 :       std::swap (op0, op1);
    2904            0 :       code = swap_condition (code);
    2905            0 :       if (!REG_P (op0))
    2906            0 :         op0 = force_reg (op_mode, op0);
    2907              :     }
    2908              : 
    2909       573433 :   *pop0 = op0;
    2910       573433 :   *pop1 = op1;
    2911       573433 :   return code;
    2912              : }
    2913              : 
/* Generate insn patterns to do a floating point compare of OPERANDS,
   i.e. OP0 CODE OP1.  The insn(s) setting FLAGS_REG are emitted into
   the current sequence; the return value is a comparison rtx
   (CODE' (reg flags) (const_int 0)) for the flags consumer (bcc, scc
   or cmov).  Note that CODE' may differ from CODE after
   canonicalization.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  /* Canonicalize the comparison: operands may be forced into registers
     or swapped, and CODE adjusted accordingly.  */
  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  /* A quiet (non-signaling) comparison is represented by wrapping the
     COMPARE in an UNSPEC_NOTRAP.  */
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      /* Rebuild the bare COMPARE: BFmode must not get the unspec
	 wrappers added below (or above).  */
      tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
      /* bf16 only has vcomisbf16; there is no vcomubf16 nor vcomxbf16,
	 so skip the OPTCOMX/NOTRAP unspecs for BFmode.  */
      if (GET_MODE (op0) != E_BFmode)
        {
          if (TARGET_AVX10_2 && (code == EQ || code == NE))
            tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
          if (unordered_compare)
            tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
        }
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      /* Store the x87 status word into a scratch register via fnstsw
	 and transfer it to the flags register with sahf.  */
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
         doesn't happen to work out to anything nice combination-wise.
         So do some bit twiddling on the value we've got in AH to come
         up with an appropriate set of condition codes.

	 The masks below test the x87 condition bits as fnstsw places
	 them into AH: C0 = 0x01, C2 = 0x04, C3 = 0x40, so
	 0x45 = C0|C2|C3 and 0x44 = C2|C3.  */

      switch (code)
        {
        case GT:
        case UNGT:
          if (code == GT || !TARGET_IEEE_FP)
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
              code = EQ;
            }
          else
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
              emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
              cmp_mode = CCmode;
              code = GEU;
            }
          break;
        case LT:
        case UNLT:
          if (code == LT && TARGET_IEEE_FP)
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
              cmp_mode = CCmode;
              code = EQ;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
              code = NE;
            }
          break;
        case GE:
        case UNGE:
          if (code == GE || !TARGET_IEEE_FP)
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
              code = EQ;
            }
          else
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
              code = NE;
            }
          break;
        case LE:
        case UNLE:
          if (code == LE && TARGET_IEEE_FP)
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
              emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
              cmp_mode = CCmode;
              code = LTU;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
              code = NE;
            }
          break;
        case EQ:
        case UNEQ:
          if (code == EQ && TARGET_IEEE_FP)
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
              cmp_mode = CCmode;
              code = EQ;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
              code = NE;
            }
          break;
        case NE:
        case LTGT:
          if (code == NE && TARGET_IEEE_FP)
            {
              emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
              emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
                                             GEN_INT (0x40)));
              code = NE;
            }
          else
            {
              emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
              code = EQ;
            }
          break;

	/* C2 (0x04) is the NaN bit, so it alone decides (un)orderedness.  */
        case UNORDERED:
          emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
          code = NE;
          break;
        case ORDERED:
          emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
          code = EQ;
          break;

        default:
          gcc_unreachable ();
        }
        break;

    default:
      gcc_unreachable();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
                         gen_rtx_REG (cmp_mode, FLAGS_REG),
                         const0_rtx);
}
    3083              : 
/* Generate insn patterns to do an integer compare of OPERANDS,
   i.e. OP0 CODE OP1.  The compare insn setting FLAGS_REG is emitted;
   the return value is a comparison rtx (CODE' (reg flags) (const_int 0))
   for the flags consumer.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Swap operands to emit carry flag comparison: GTU/LEU become
     LTU/GEU with swapped operands, which can be tested via CF alone.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* Attempt to use PTEST, if available, when testing vector modes for
     equality/inequality against zero.  Only a zero-offset TImode view
     of a 16-byte vector register qualifies.  */
  if (op1 == const0_rtx
      && SUBREG_P (op0)
      && cmpmode == CCZmode
      && SUBREG_BYTE (op0) == 0
      && REG_P (SUBREG_REG (op0))
      && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
      && TARGET_SSE4_1
      && GET_MODE (op0) == TImode
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
    {
      tmp = SUBREG_REG (op0);
      /* View HF/BF vectors as integer vectors for ptest.  */
      if (GET_MODE (tmp) == V8HFmode || GET_MODE (tmp) == V8BFmode)
        tmp = gen_lowpart (V8HImode, tmp);
      tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
    }
  else
    tmp = gen_rtx_COMPARE (cmpmode, op0, op1);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
    3131              : 
    3132              : static rtx
    3133      7688388 : ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
    3134              : {
    3135      7688388 :   rtx ret;
    3136              : 
    3137      7688388 :   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    3138       133111 :     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
    3139              : 
    3140      7555277 :   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    3141              :     {
    3142       571347 :       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
    3143       571347 :       ret = ix86_expand_fp_compare (code, op0, op1);
    3144              :     }
    3145              :   else
    3146      6983930 :     ret = ix86_expand_int_compare (code, op0, op1);
    3147              : 
    3148      7688388 :   return ret;
    3149              : }
    3150              : 
    3151              : void
    3152       586372 : ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
    3153              : {
    3154       586372 :   rtx ret;
    3155              : 
    3156       586372 :   gcc_assert (GET_MODE (dest) == QImode);
    3157              : 
    3158       586372 :   ret = ix86_expand_compare (code, op0, op1);
    3159       586372 :   PUT_MODE (ret, QImode);
    3160       586372 :   emit_insn (gen_rtx_SET (dest, ret));
    3161       586372 : }
    3162              : 
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128.
   When OP2 is const0_rtx the unordered result is -128 (branchy
   expansion with labels); otherwise OP2 supplies the value DEST gets
   in the unordered case and a branch-free setcc/subtract sequence is
   used for the ordered part.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  /* Pre-cleared SImode register for the setcc_si_slp trick below.  */
  rtx zero = NULL_RTX;
  if (op2 != const0_rtx
      && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
      && GET_MODE (dest) == SImode)
    zero = force_reg (SImode, const0_rtx);
  /* Single FP compare; all conditions below read its flags.  */
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  /* l0: equal case, l1: greater case (branchy path only);
     l2: unordered case (IEEE only); lend: join point.  */
  rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;
  if (l2)
    {
      /* Divert NaN operands to l2 first, so the tests below only see
	 ordered (or, without IEEE, assumed-ordered) inputs.  */
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
                               gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
                                  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
    }
  if (op2 == const0_rtx)
    {
      /* Branchy expansion: test UNEQ (equal-or-unordered; unordered was
	 already diverted above when IEEE), then GT, else fall through
	 to the less-than case.  */
      rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
                               gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
                                  gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::unlikely ());
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
                                  gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::even ());
      emit_move_insn (dest, constm1_rtx);
      emit_jump (lend);
      emit_label (l0);
      emit_move_insn (dest, const0_rtx);
      emit_jump (lend);
      emit_label (l1);
      emit_move_insn (dest, const1_rtx);
    }
  else
    {
      /* Branch-free ordered part: dest = (op0 > op1) - (op0 < op1).  */
      rtx lt_tmp = NULL_RTX;
      if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
        {
          lt_tmp = gen_reg_rtx (QImode);
          ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
                             const0_rtx);
          if (GET_MODE (dest) != QImode)
            {
              tmp = gen_reg_rtx (GET_MODE (dest));
              emit_insn (gen_rtx_SET (tmp,
                                      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                           lt_tmp)));
              lt_tmp = tmp;
            }
        }
      rtx gt_tmp;
      if (zero)
        {
          /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
             before the floating point comparison and use setcc_si_slp
             pattern to hide it from the combiner, so that it doesn't
             undo it.  Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
             the ZERO_EXTEND normally emitted would need to be AND
             with flags clobber.  */
          tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
          PUT_MODE (tmp, QImode);
          emit_insn (gen_setcc_si_slp (zero, tmp, zero));
          gt_tmp = zero;
        }
      else
        {
          gt_tmp = gen_reg_rtx (QImode);
          ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx);
          if (GET_MODE (dest) != QImode)
            {
              tmp = gen_reg_rtx (GET_MODE (dest));
              emit_insn (gen_rtx_SET (tmp,
                                      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                           gt_tmp)));
              gt_tmp = tmp;
            }
        }
      if (lt_tmp)
        {
          tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
                                     dest, 0, OPTAB_DIRECT);
          if (!rtx_equal_p (tmp, dest))
            emit_move_insn (dest, tmp);
        }
      else
        {
          /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
             do ZERO_EXTEND without clobbering flags.  */
          tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
          PUT_MODE (tmp, SImode);
          emit_insn (gen_subsi3_carry (dest, gt_tmp,
                                       force_reg (GET_MODE (dest), const0_rtx),
                                       XEXP (gt, 0), tmp));
        }
    }
  emit_jump (lend);
  if (l2)
    {
      /* Unordered result: -128 for the branchy form, otherwise OP2.  */
      emit_label (l2);
      emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
    }
  emit_label (lend);
}
    3281              : 
/* Expand integral op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1.
   OP2 must be a nonzero constant; INTVAL (OP2) != 1 selects a signed
   comparison (CCGCmode, LT/GT), INTVAL (OP2) == 1 an unsigned one
   (CCmode, LTU/GTU).  The result is computed branch-free as
   (op0 > op1) - (op0 < op1).  */

void
ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  gcc_assert (INTVAL (op2));
  /* Pre-cleared registers for the setcc_si_slp trick when zero
     extension would otherwise be done with AND (clobbering flags).  */
  rtx zero1 = NULL_RTX, zero2 = NULL_RTX;
  if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode)
    {
      zero1 = force_reg (SImode, const0_rtx);
      if (INTVAL (op2) != 1)
        zero2 = force_reg (SImode, const0_rtx);
    }

  /* Not using ix86_expand_int_compare here, so that it doesn't swap
     operands nor optimize CC mode - we need a mode usable for both
     LT and GT resp. LTU and GTU comparisons with the same unswapped
     operands.  */
  rtx flags = gen_rtx_REG (INTVAL (op2) != 1 ? CCGCmode : CCmode, FLAGS_REG);
  rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));
  /* lt_tmp becomes (op0 < op1) widened to the mode of DEST.  */
  rtx lt_tmp = NULL_RTX;
  if (zero2)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
         ZERO_EXTEND.  */
      tmp = ix86_expand_compare (LT, flags, const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero2, tmp, zero2));
      lt_tmp = zero2;
    }
  else if (!zero1)
    {
      lt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (lt_tmp, INTVAL (op2) != 1 ? LT : LTU, flags,
                         const0_rtx);
      if (GET_MODE (dest) != QImode)
        {
          tmp = gen_reg_rtx (GET_MODE (dest));
          emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                            lt_tmp)));
          lt_tmp = tmp;
        }
    }
  /* gt_tmp becomes (op0 > op1) widened to the mode of DEST.  */
  rtx gt_tmp;
  if (zero1)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
         ZERO_EXTEND.  */
      tmp = ix86_expand_compare (INTVAL (op2) != 1 ? GT : GTU, flags,
                                 const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero1, tmp, zero1));
      gt_tmp = zero1;
    }
  else
    {
      gt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (gt_tmp, INTVAL (op2) != 1 ? GT : GTU, flags,
                         const0_rtx);
      if (GET_MODE (dest) != QImode)
        {
          tmp = gen_reg_rtx (GET_MODE (dest));
          emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
                                                            gt_tmp)));
          gt_tmp = tmp;
        }
    }
  if (lt_tmp)
    {
      /* dest = gt_tmp - lt_tmp, i.e. -1/0/1.  */
      tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
                                 0, OPTAB_DIRECT);
      if (!rtx_equal_p (tmp, dest))
        emit_move_insn (dest, tmp);
    }
  else
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
         do ZERO_EXTEND without clobbering flags.  */
      tmp = ix86_expand_compare (LTU, flags, const0_rtx);
      PUT_MODE (tmp, SImode);
      emit_insn (gen_subsi3_carry (dest, gt_tmp,
                                   force_reg (GET_MODE (dest), const0_rtx),
                                   flags, tmp));
    }
}
    3369              : 
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation: on success *POP is an LTU
   or GEU comparison of the flags register against zero, i.e. one whose
   truth is carried entirely by CF and is thus usable by adc/sbb.  */
static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  /* OP0 may be a constant (VOIDmode); fall back to OP1's mode then.  */
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
         into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
          || code == ORDERED || code == UNORDERED)
        return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
          && !TARGET_IEEE_FP)
        {
          std::swap (op0, op1);
          code = swap_condition (code);
        }

      /* Try to expand the comparison and verify that we end up with
         carry flag based comparison.  This fails to be true only when
         we decide to expand comparison using arithmetic that is not
         too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
        code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
        code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
        return false;

      /* Only now commit the tentatively-built compare sequence.  */
      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  /* Canonicalize CODE/OP1 to an LTU or GEU comparison.  */
  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
        return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
        {
          op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
          /* Bail out on overflow.  We still can swap operands but that
             would force loading of the constant into register.  */
          if (op1 == const0_rtx
              || !x86_64_immediate_operand (op1, GET_MODE (op1)))
            return false;
          code = (code == GTU ? GEU : LTU);
        }
      else
        {
          std::swap (op0, op1);
          code = (code == GTU ? LTU : GEU);
        }
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
        return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    /* Similarly, a>-1 is (unsigned)a<0x80000000 as well.  */
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
        return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
        return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
    3492              : 
    3493              : /* Expand conditional increment or decrement using adc/sbb instructions.
    3494              :    The default case using setcc followed by the conditional move can be
    3495              :    done by generic code.
                      : 
                      :    operands[0] is the destination, operands[1] the comparison
                      :    (code op0 op1), operands[2] the value to be incremented or
                      :    decremented, and operands[3] the step, which must be const1_rtx
                      :    or constm1_rtx for this expansion to apply.  Returns true when
                      :    the adc/sbb sequence was emitted, false to fall back to the
                      :    generic expansion.  */
    3496              : bool
    3497         6816 : ix86_expand_int_addcc (rtx operands[])
    3498              : {
    3499         6816 :   enum rtx_code code = GET_CODE (operands[1]);
    3500         6816 :   rtx flags;
    3501         6816 :   rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
    3502         6816 :   rtx compare_op;
    3503         6816 :   rtx val = const0_rtx;
    3504         6816 :   bool fpcmp = false;
    3505         6816 :   machine_mode mode;
    3506         6816 :   rtx op0 = XEXP (operands[1], 0);
    3507         6816 :   rtx op1 = XEXP (operands[1], 1);
    3508              : 
                      :   /* Only a step of +1 or -1 can be folded into adc/sbb.  */
    3509         6816 :   if (operands[3] != const1_rtx
    3510         2809 :       && operands[3] != constm1_rtx)
    3511              :     return false;
                      :   /* Rewrite the comparison into one that uses only the carry flag;
                      :      on success compare_op is guaranteed to be LTU or GEU (asserted
                      :      by ix86_expand_carry_flag_compare).  */
    3512         4728 :   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    3513              :      return false;
    3514         1274 :   code = GET_CODE (compare_op);
    3515              : 
    3516         1274 :   flags = XEXP (compare_op, 0);
    3517              : 
                      :   /* For a floating-point compare, map the FP condition code to the
                      :      equivalent integer condition on the flags register.  */
    3518         1274 :   if (GET_MODE (flags) == CCFPmode)
    3519              :     {
    3520            4 :       fpcmp = true;
    3521            4 :       code = ix86_fp_compare_code_to_integer (code);
    3522              :     }
    3523              : 
                      :   /* adc/sbb consume the carry flag directly (the LTU sense).  For the
                      :      GEU sense, reverse the condition in place and compensate with an
                      :      addend of -1, keeping the overall effect unchanged.  Note this
                      :      mutates compare_op, which is also fed to the emitted insn.  */
    3524         1274 :   if (code != LTU)
    3525              :     {
    3526          733 :       val = constm1_rtx;
    3527          733 :       if (fpcmp)
    3528            4 :         PUT_CODE (compare_op,
    3529              :                   reverse_condition_maybe_unordered
    3530              :                     (GET_CODE (compare_op)));
    3531              :       else
    3532          729 :         PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    3533              :     }
    3534              : 
    3535         1274 :   mode = GET_MODE (operands[0]);
    3536              : 
    3537              :   /* Construct either adc or sbb insn.  The choice depends on both the
    3538              :      condition sense and the sign of the step in operands[3].  */
    3539         1274 :   if ((code == LTU) == (operands[3] == constm1_rtx))
    3540              :     insn = gen_sub3_carry;
    3541              :   else
    3542          521 :     insn = gen_add3_carry;
    3543              : 
    3544         1274 :   emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
    3545              : 
    3546         1274 :   return true;
    3547              : }
    3547              : 
    3548              : bool
    3549       438315 : ix86_expand_int_movcc (rtx operands[])
    3550              : {
    3551       438315 :   enum rtx_code code = GET_CODE (operands[1]), compare_code;
    3552       438315 :   rtx_insn *compare_seq;
    3553       438315 :   rtx compare_op;
    3554       438315 :   machine_mode mode = GET_MODE (operands[0]);
    3555       438315 :   bool sign_bit_compare_p = false;
    3556       438315 :   bool negate_cc_compare_p = false;
    3557       438315 :   rtx op0 = XEXP (operands[1], 0);
    3558       438315 :   rtx op1 = XEXP (operands[1], 1);
    3559       438315 :   rtx op2 = operands[2];
    3560       438315 :   rtx op3 = operands[3];
    3561              : 
    3562       438315 :   if (GET_MODE (op0) == TImode
    3563       422897 :       || (GET_MODE (op0) == DImode
    3564       102538 :           && !TARGET_64BIT))
    3565              :     return false;
    3566              : 
    3567       421801 :   if (GET_MODE (op0) == BFmode
    3568       421801 :       && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    3569              :     return false;
    3570              : 
    3571       421801 :   start_sequence ();
    3572       421801 :   compare_op = ix86_expand_compare (code, op0, op1);
    3573       421801 :   compare_seq = end_sequence ();
    3574              : 
    3575       421801 :   compare_code = GET_CODE (compare_op);
    3576              : 
    3577       421801 :   if ((op1 == const0_rtx && (code == GE || code == LT))
    3578       379820 :       || (op1 == constm1_rtx && (code == GT || code == LE)))
    3579              :     sign_bit_compare_p = true;
    3580              : 
    3581              :   /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
    3582              :      but if op1 is a constant, the latter form allows more optimizations,
    3583              :      either through the last 2 ops being constant handling, or the one
    3584              :      constant and one variable cases.  On the other side, for cmov the
    3585              :      former might be better as we don't need to load the constant into
    3586              :      another register.  */
    3587       379820 :   if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
    3588              :     op2 = op1;
    3589              :   /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
    3590       421287 :   else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
    3591              :     op3 = op1;
    3592              : 
    3593              :   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
    3594              :      HImode insns, we'd be swallowed in word prefix ops.  */
    3595              : 
    3596         4882 :   if ((mode != HImode || TARGET_FAST_PREFIX)
    3597       451124 :       && (mode != (TARGET_64BIT ? TImode : DImode))
    3598       421801 :       && CONST_INT_P (op2)
    3599       458743 :       && CONST_INT_P (op3))
    3600              :     {
    3601        29973 :       rtx out = operands[0];
    3602        29973 :       HOST_WIDE_INT ct = INTVAL (op2);
    3603        29973 :       HOST_WIDE_INT cf = INTVAL (op3);
    3604        29973 :       HOST_WIDE_INT diff;
    3605              : 
    3606        29973 :       if ((mode == SImode
    3607        16375 :            || (TARGET_64BIT && mode == DImode))
    3608        18435 :           && (GET_MODE (op0) == SImode
    3609        14370 :               || (TARGET_64BIT && GET_MODE (op0) == DImode)))
    3610              :         {
    3611              :           /* Special case x != 0 ? -1 : y.  */
    3612        13176 :           if (code == NE && op1 == const0_rtx && ct == -1)
    3613              :             {
    3614              :               negate_cc_compare_p = true;
    3615              :               std::swap (ct, cf);
    3616              :               code = EQ;
    3617              :             }
    3618        13077 :           else if (code == EQ && op1 == const0_rtx && cf == -1)
    3619        29973 :             negate_cc_compare_p = true;
    3620              :         }
    3621              : 
    3622        29973 :       diff = (unsigned HOST_WIDE_INT) ct - cf;
    3623              :       /* Make sure we can represent the difference between the two values.  */
    3624        29973 :       if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3625       438315 :         return false;
    3626              : 
    3627              :       /*  Sign bit compares are better done using shifts than we do by using
    3628              :           sbb.  */
    3629        29825 :       if (sign_bit_compare_p
    3630        29825 :           || negate_cc_compare_p
    3631        29825 :           || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    3632              :         {
    3633              :           /* Detect overlap between destination and compare sources.  */
    3634        11176 :           rtx tmp = out;
    3635              : 
    3636        11176 :           if (negate_cc_compare_p)
    3637              :             {
    3638          280 :               if (GET_MODE (op0) == DImode)
    3639          106 :                 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
    3640              :               else
    3641          174 :                 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
    3642          174 :                                               gen_lowpart (SImode, op0)));
    3643              : 
    3644          280 :               tmp = gen_reg_rtx (mode);
    3645          280 :               if (mode == DImode)
    3646          123 :                 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
    3647              :               else
    3648          157 :                 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
    3649              :                                                                   tmp)));
    3650              :             }
    3651        10896 :           else if (!sign_bit_compare_p)
    3652              :             {
    3653        10542 :               rtx flags;
    3654        10542 :               bool fpcmp = false;
    3655              : 
    3656        10542 :               compare_code = GET_CODE (compare_op);
    3657              : 
    3658        10542 :               flags = XEXP (compare_op, 0);
    3659              : 
    3660        10542 :               if (GET_MODE (flags) == CCFPmode)
    3661              :                 {
    3662           59 :                   fpcmp = true;
    3663           59 :                   compare_code
    3664           59 :                     = ix86_fp_compare_code_to_integer (compare_code);
    3665              :                 }
    3666              : 
    3667              :               /* To simplify rest of code, restrict to the GEU case.  */
    3668        10542 :               if (compare_code == LTU)
    3669              :                 {
    3670         6001 :                   std::swap (ct, cf);
    3671         6001 :                   compare_code = reverse_condition (compare_code);
    3672         6001 :                   code = reverse_condition (code);
    3673              :                 }
    3674              :               else
    3675              :                 {
    3676         4541 :                   if (fpcmp)
    3677           59 :                     PUT_CODE (compare_op,
    3678              :                               reverse_condition_maybe_unordered
    3679              :                                 (GET_CODE (compare_op)));
    3680              :                   else
    3681         4482 :                     PUT_CODE (compare_op,
    3682              :                               reverse_condition (GET_CODE (compare_op)));
    3683              :                 }
    3684              : 
    3685        10542 :               diff = (unsigned HOST_WIDE_INT) ct - cf;
    3686              :               /* Make sure we can represent the difference
    3687              :                  between the two values.  */
    3688        10542 :               if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3689              :                 return false;
    3690              : 
    3691        10541 :               if (reg_overlap_mentioned_p (out, compare_op))
    3692            0 :                 tmp = gen_reg_rtx (mode);
    3693              : 
    3694        10541 :               if (mode == DImode)
    3695         2182 :                 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
    3696              :               else
    3697         8359 :                 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
    3698              :                                                  flags, compare_op));
    3699              :             }
    3700              :           else
    3701              :             {
    3702          354 :               if (code == GT || code == GE)
    3703          153 :                 code = reverse_condition (code);
    3704              :               else
    3705              :                 {
    3706          201 :                   std::swap (ct, cf);
    3707              : 
    3708          201 :                   diff = (unsigned HOST_WIDE_INT) ct - cf;
    3709              :                   /* Make sure we can represent the difference
    3710              :                      between the two values.  */
    3711          201 :                   if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3712              :                     return false;
    3713              :                 }
    3714          349 :               tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
    3715              :             }
    3716              : 
    3717        11170 :           if (diff == 1)
    3718              :             {
    3719              :               /*
    3720              :                * cmpl op0,op1
    3721              :                * sbbl dest,dest
    3722              :                * [addl dest, ct]
    3723              :                *
    3724              :                * Size 5 - 8.
    3725              :                */
    3726         1138 :               if (ct)
    3727          965 :                 tmp = expand_simple_binop (mode, PLUS,
    3728              :                                            tmp, GEN_INT (ct),
    3729              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3730              :             }
    3731        10032 :           else if (cf == -1)
    3732              :             {
    3733              :               /*
    3734              :                * cmpl op0,op1
    3735              :                * sbbl dest,dest
    3736              :                * orl $ct, dest
    3737              :                *
    3738              :                * Size 8.
    3739              :                */
    3740          595 :               tmp = expand_simple_binop (mode, IOR,
    3741              :                                          tmp, GEN_INT (ct),
    3742              :                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
    3743              :             }
    3744         9437 :           else if (diff == -1 && ct)
    3745              :             {
    3746              :               /*
    3747              :                * cmpl op0,op1
    3748              :                * sbbl dest,dest
    3749              :                * notl dest
    3750              :                * [addl dest, cf]
    3751              :                *
    3752              :                * Size 8 - 11.
    3753              :                */
    3754          687 :               tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
    3755          687 :               if (cf)
    3756          669 :                 tmp = expand_simple_binop (mode, PLUS,
    3757              :                                            copy_rtx (tmp), GEN_INT (cf),
    3758              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3759              :             }
    3760              :           else
    3761              :             {
    3762              :               /*
    3763              :                * cmpl op0,op1
    3764              :                * sbbl dest,dest
    3765              :                * [notl dest]
    3766              :                * andl cf - ct, dest
    3767              :                * [addl dest, ct]
    3768              :                *
    3769              :                * Size 8 - 11.
    3770              :                */
    3771              : 
    3772         8750 :               if (cf == 0)
    3773              :                 {
    3774          895 :                   cf = ct;
    3775          895 :                   ct = 0;
    3776          895 :                   tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
    3777              :                 }
    3778              : 
    3779         8750 :               HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
    3780              :               /* Make sure we can represent the difference
    3781              :                  between the two values.  */
    3782         8750 :               if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
    3783        16668 :                 return false;
    3784              : 
    3785         8750 :               tmp = expand_simple_binop (mode, AND,
    3786              :                                          copy_rtx (tmp),
    3787         8750 :                                          gen_int_mode (ival, mode),
    3788              :                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
    3789         8750 :               if (ct)
    3790         7072 :                 tmp = expand_simple_binop (mode, PLUS,
    3791              :                                            copy_rtx (tmp), GEN_INT (ct),
    3792              :                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
    3793              :             }
    3794              : 
    3795        11170 :           if (!rtx_equal_p (tmp, out))
    3796          470 :             emit_move_insn (copy_rtx (out), copy_rtx (tmp));
    3797              : 
    3798        11170 :           return true;
    3799              :         }
    3800              : 
    3801        18649 :       if (diff < 0)
    3802              :         {
    3803         8848 :           machine_mode cmp_mode = GET_MODE (op0);
    3804         8848 :           enum rtx_code new_code;
    3805              : 
    3806         8848 :           if (SCALAR_FLOAT_MODE_P (cmp_mode))
    3807              :             {
    3808           54 :               gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
    3809              : 
    3810              :               /* We may be reversing a non-trapping
    3811              :                  comparison to a trapping comparison.  */
    3812          104 :                   if (HONOR_NANS (cmp_mode) && flag_trapping_math
    3813           41 :                       && code != EQ && code != NE
    3814           95 :                       && code != ORDERED && code != UNORDERED)
    3815              :                     new_code = UNKNOWN;
    3816              :                   else
    3817           13 :                     new_code = reverse_condition_maybe_unordered (code);
    3818              :             }
    3819              :           else
    3820         8794 :             new_code = ix86_reverse_condition (code, cmp_mode);
    3821         8807 :           if (new_code != UNKNOWN)
    3822              :             {
    3823         8807 :               std::swap (ct, cf);
    3824              : 
    3825         8807 :               diff = (unsigned HOST_WIDE_INT) ct - cf;
    3826              :               /* Make sure we can represent the difference
    3827              :                  between the two values.  */
    3828         8807 :               if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
    3829              :                 return false;
    3830              : 
    3831              :               code = new_code;
    3832              :             }
    3833              :         }
    3834              : 
    3835        18649 :       compare_code = UNKNOWN;
    3836        18649 :       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
    3837        16878 :           && CONST_INT_P (op1))
    3838              :         {
    3839        11029 :           if (op1 == const0_rtx
    3840          214 :               && (code == LT || code == GE))
    3841              :             compare_code = code;
    3842        11029 :           else if (op1 == constm1_rtx)
    3843              :             {
    3844          295 :               if (code == LE)
    3845              :                 compare_code = LT;
    3846          295 :               else if (code == GT)
    3847              :                 compare_code = GE;
    3848              :             }
    3849              :         }
    3850              : 
    3851              :       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
    3852              :       if (compare_code != UNKNOWN
    3853            0 :           && GET_MODE (op0) == GET_MODE (out)
    3854            0 :           && (cf == -1 || ct == -1))
    3855              :         {
    3856              :           /* If lea code below could be used, only optimize
    3857              :              if it results in a 2 insn sequence.  */
    3858              : 
    3859            0 :           if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
    3860            0 :                  || diff == 3 || diff == 5 || diff == 9)
    3861            0 :               || (compare_code == LT && ct == -1)
    3862            0 :               || (compare_code == GE && cf == -1))
    3863              :             {
    3864              :               /*
    3865              :                * notl op1       (if necessary)
    3866              :                * sarl $31, op1
    3867              :                * orl cf, op1
    3868              :                */
    3869            0 :               if (ct != -1)
    3870              :                 {
    3871            0 :                   cf = ct;
    3872            0 :                   ct = -1;
    3873            0 :                   code = reverse_condition (code);
    3874              :                 }
    3875              : 
    3876            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
    3877              : 
    3878            0 :               out = expand_simple_binop (mode, IOR,
    3879              :                                          out, GEN_INT (cf),
    3880              :                                          out, 1, OPTAB_DIRECT);
    3881            0 :               if (out != operands[0])
    3882            0 :                 emit_move_insn (operands[0], out);
    3883              : 
    3884            0 :               return true;
    3885              :             }
    3886              :         }
    3887              : 
    3888              : 
    3889        29746 :       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
    3890        11097 :            || diff == 3 || diff == 5 || diff == 9)
    3891         7895 :           && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
    3892        26544 :           && (mode != DImode
    3893         1922 :               || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
    3894              :         {
    3895              :           /*
    3896              :            * xorl dest,dest
    3897              :            * cmpl op1,op2
    3898              :            * setcc dest
    3899              :            * lea cf(dest*(ct-cf)),dest
    3900              :            *
    3901              :            * Size 14.
    3902              :            *
    3903              :            * This also catches the degenerate setcc-only case.
    3904              :            */
    3905              : 
    3906         7895 :           rtx tmp;
    3907         7895 :           int nops;
    3908              : 
    3909         7895 :           out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
    3910              : 
    3911         7895 :           nops = 0;
    3912              :           /* On x86_64 the lea instruction operates on Pmode, so we need
    3913              :              to get arithmetics done in proper mode to match.  */
    3914         7895 :           if (diff == 1)
    3915         6671 :             tmp = copy_rtx (out);
    3916              :           else
    3917              :             {
    3918         1224 :               rtx out1;
    3919         1224 :               out1 = copy_rtx (out);
    3920         1224 :               tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
    3921         1224 :               nops++;
    3922         1224 :               if (diff & 1)
    3923              :                 {
    3924          254 :                   tmp = gen_rtx_PLUS (mode, tmp, out1);
    3925          254 :                   nops++;
    3926              :                 }
    3927              :             }
    3928         7895 :           if (cf != 0)
    3929              :             {
    3930         6925 :               tmp = plus_constant (mode, tmp, cf);
    3931         6925 :               nops++;
    3932              :             }
    3933         7895 :           if (!rtx_equal_p (tmp, out))
    3934              :             {
    3935         7165 :               if (nops == 1)
    3936         6039 :                 out = force_operand (tmp, copy_rtx (out));
    3937              :               else
    3938         1126 :                 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
    3939              :             }
    3940         7895 :           if (!rtx_equal_p (out, operands[0]))
    3941          894 :             emit_move_insn (operands[0], copy_rtx (out));
    3942              : 
    3943         7895 :           return true;
    3944              :         }
    3945              : 
    3946              :       /*
    3947              :        * General case:                  Jumpful:
    3948              :        *   xorl dest,dest               cmpl op1, op2
    3949              :        *   cmpl op1, op2                movl ct, dest
    3950              :        *   setcc dest                   jcc 1f
    3951              :        *   decl dest                    movl cf, dest
    3952              :        *   andl (cf-ct),dest            1:
    3953              :        *   addl ct,dest
    3954              :        *
    3955              :        * Size 20.                       Size 14.
    3956              :        *
    3957              :        * This is reasonably steep, but branch mispredict costs are
    3958              :        * high on modern cpus, so consider failing only if optimizing
    3959              :        * for space.
    3960              :        */
    3961              : 
    3962        10754 :       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    3963        10754 :           && BRANCH_COST (optimize_insn_for_speed_p (),
    3964              :                           false) >= 2)
    3965              :         {
    3966            0 :           if (cf == 0)
    3967              :             {
    3968            0 :               machine_mode cmp_mode = GET_MODE (op0);
    3969            0 :               enum rtx_code new_code;
    3970              : 
    3971            0 :               if (SCALAR_FLOAT_MODE_P (cmp_mode))
    3972              :                 {
    3973            0 :                   gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
    3974              : 
    3975              :                   /* We may be reversing a non-trapping
    3976              :                      comparison to a trapping comparison.  */
    3977            0 :                   if (HONOR_NANS (cmp_mode) && flag_trapping_math
    3978            0 :                       && code != EQ && code != NE
    3979            0 :                       && code != ORDERED && code != UNORDERED)
    3980              :                     new_code = UNKNOWN;
    3981              :                   else
    3982            0 :                     new_code = reverse_condition_maybe_unordered (code);
    3983              : 
    3984              :                 }
    3985              :               else
    3986              :                 {
    3987            0 :                   new_code = ix86_reverse_condition (code, cmp_mode);
    3988            0 :                   if (compare_code != UNKNOWN && new_code != UNKNOWN)
    3989            0 :                     compare_code = reverse_condition (compare_code);
    3990              :                 }
    3991              : 
    3992            0 :               if (new_code != UNKNOWN)
    3993              :                 {
    3994            0 :                   cf = ct;
    3995            0 :                   ct = 0;
    3996            0 :                   code = new_code;
    3997              :                 }
    3998              :             }
    3999              : 
    4000            0 :           if (compare_code != UNKNOWN)
    4001              :             {
    4002              :               /* notl op1       (if needed)
    4003              :                  sarl $31, op1
    4004              :                  andl (cf-ct), op1
    4005              :                  addl ct, op1
    4006              : 
    4007              :                  For x < 0 (resp. x <= -1) there will be no notl,
    4008              :                  so if possible swap the constants to get rid of the
    4009              :                  complement.
    4010              :                  True/false will be -1/0 while code below (store flag
    4011              :                  followed by decrement) is 0/-1, so the constants need
    4012              :                  to be exchanged once more.  */
    4013              : 
    4014            0 :               if (compare_code == GE || !cf)
    4015              :                 {
    4016            0 :                   code = reverse_condition (code);
    4017            0 :                   compare_code = LT;
    4018              :                 }
    4019              :               else
    4020              :                 std::swap (ct, cf);
    4021              : 
    4022            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
    4023              :             }
    4024              :           else
    4025              :             {
    4026            0 :               out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
    4027              : 
    4028            0 :               out = expand_simple_binop (mode, PLUS, copy_rtx (out),
    4029              :                                          constm1_rtx,
    4030              :                                          copy_rtx (out), 1, OPTAB_DIRECT);
    4031              :             }
    4032              : 
    4033            0 :           HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
    4034              :           /* Make sure we can represent the difference
    4035              :              between the two values.  */
    4036            0 :           if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
    4037              :             return false;
    4038              : 
    4039            0 :           out = expand_simple_binop (mode, AND, copy_rtx (out),
    4040            0 :                                      gen_int_mode (ival, mode),
    4041              :                                      copy_rtx (out), 1, OPTAB_DIRECT);
    4042            0 :           if (ct)
    4043            0 :             out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
    4044              :                                        copy_rtx (out), 1, OPTAB_DIRECT);
    4045            0 :           if (!rtx_equal_p (out, operands[0]))
    4046            0 :             emit_move_insn (operands[0], copy_rtx (out));
    4047              : 
    4048            0 :           return true;
    4049              :         }
    4050              :     }
    4051              : 
    4052       402582 :   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    4053              :     {
    4054              :       /* Try a few things more with specific constants and a variable.  */
    4055              : 
    4056            0 :       optab op;
    4057            0 :       rtx var, orig_out, out, tmp;
    4058              : 
    4059            0 :       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
    4060              :         return false;
    4061              : 
    4062            0 :       operands[2] = op2;
    4063            0 :       operands[3] = op3;
    4064              : 
    4065              :       /* If one of the two operands is an interesting constant, load a
    4066              :          constant with the above and mask it in with a logical operation.  */
    4067              : 
    4068            0 :       if (CONST_INT_P (operands[2]))
    4069              :         {
    4070            0 :           var = operands[3];
    4071            0 :           if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
    4072            0 :             operands[3] = constm1_rtx, op = and_optab;
    4073            0 :           else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
    4074            0 :             operands[3] = const0_rtx, op = ior_optab;
    4075              :           else
    4076              :             return false;
    4077              :         }
    4078            0 :       else if (CONST_INT_P (operands[3]))
    4079              :         {
    4080            0 :           var = operands[2];
    4081            0 :           if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
    4082              :             {
    4083              :               /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
    4084              :                  "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
    4085            0 :               if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
    4086            0 :                 operands[1] = simplify_gen_relational (LT, VOIDmode,
    4087            0 :                                                        GET_MODE (op0),
    4088              :                                                        op0, const0_rtx);
    4089              : 
    4090            0 :               operands[2] = constm1_rtx;
    4091            0 :               op = and_optab;
    4092              :             }
    4093            0 :           else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
    4094            0 :             operands[2] = const0_rtx, op = ior_optab;
    4095              :           else
    4096              :             return false;
    4097              :         }
    4098              :       else
    4099              :         return false;
    4100              : 
    4101            0 :       orig_out = operands[0];
    4102            0 :       tmp = gen_reg_rtx (mode);
    4103            0 :       operands[0] = tmp;
    4104              : 
    4105              :       /* Recurse to get the constant loaded.  */
    4106            0 :       if (!ix86_expand_int_movcc (operands))
    4107              :         return false;
    4108              : 
    4109              :       /* Mask in the interesting variable.  */
    4110            0 :       out = expand_binop (mode, op, var, tmp, orig_out, 0,
    4111              :                           OPTAB_WIDEN);
    4112            0 :       if (!rtx_equal_p (out, orig_out))
    4113            0 :         emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
    4114              : 
    4115            0 :       return true;
    4116              :     }
    4117              : 
    4118              :   /*
    4119              :    * For comparison with above,
    4120              :    *
    4121              :    * movl cf,dest
    4122              :    * movl ct,tmp
    4123              :    * cmpl op1,op2
    4124              :    * cmovcc tmp,dest
    4125              :    *
    4126              :    * Size 15.
    4127              :    */
    4128              : 
    4129       402582 :   if (! nonimmediate_operand (operands[2], mode))
    4130        27677 :     operands[2] = force_reg (mode, operands[2]);
    4131       402582 :   if (! nonimmediate_operand (operands[3], mode))
    4132       178197 :     operands[3] = force_reg (mode, operands[3]);
    4133              : 
    4134       402582 :   if (! register_operand (operands[2], VOIDmode)
    4135       402582 :       && (mode == QImode
    4136         1093 :           || ! register_operand (operands[3], VOIDmode)))
    4137         1564 :     operands[2] = force_reg (mode, operands[2]);
    4138              : 
    4139       402582 :   if (mode == QImode
    4140       402582 :       && ! register_operand (operands[3], VOIDmode))
    4141          592 :     operands[3] = force_reg (mode, operands[3]);
    4142              : 
    4143       402582 :   emit_insn (compare_seq);
    4144       402582 :   emit_insn (gen_rtx_SET (operands[0],
    4145              :                           gen_rtx_IF_THEN_ELSE (mode,
    4146              :                                                 compare_op, operands[2],
    4147              :                                                 operands[3])));
    4148       402582 :   return true;
    4149              : }
    4150              : 
    4151              : /* Detect conditional moves that exactly match min/max operational
    4152              :    semantics.  Note that this is IEEE safe, as long as we don't
    4153              :    interchange the operands.
    4154              : 
    4155              :    Returns FALSE if this conditional move doesn't match a MIN/MAX,
    4156              :    and TRUE if the operation is successful and instructions are emitted.  */
    4157              : 
    4158              : static bool
    4159         9779 : ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
    4160              :                            rtx cmp_op1, rtx if_true, rtx if_false)
    4161              : {
    4162         9779 :   machine_mode mode = GET_MODE (dest);
    4163         9779 :   bool is_min;
    4164         9779 :   rtx tmp;
    4165              : 
    4166         9779 :   if (code == LT)
    4167              :     ;
    4168         3290 :   else if (code == LE && !HONOR_NANS (mode))
    4169              :     {
    4170              :       /* We can swap LE to GE and then invert to LT.  */
    4171              :       std::swap (cmp_op0, cmp_op1);
    4172              :       std::swap (if_true, if_false);
    4173              :     }
    4174         3249 :   else if (code == UNGE)
    4175              :     std::swap (if_true, if_false);
    4176              :   else
    4177              :     return false;
    4178              : 
    4179         8653 :   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    4180              :     is_min = true;
    4181         4595 :   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    4182              :     is_min = false;
    4183              :   else
    4184         1026 :     return false;
    4185              : 
    4186         7627 :   if (immediate_operand (if_false, mode))
    4187            8 :     if_false = force_reg (mode, if_false);
    4188         7627 :   if (immediate_operand (if_true, mode))
    4189            0 :     if_true = force_reg (mode, if_true);
    4190              : 
    4191              :   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
    4192              :      but MODE may be a vector mode and thus not appropriate.  */
    4193         7627 :   if (!flag_finite_math_only || flag_signed_zeros)
    4194              :     {
    4195         7627 :       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
    4196         7627 :       rtvec v;
    4197              : 
    4198         7627 :       if_true = force_reg (mode, if_true);
    4199         7627 :       v = gen_rtvec (2, if_true, if_false);
    4200         7627 :       tmp = gen_rtx_UNSPEC (mode, v, u);
    4201         7627 :     }
    4202              :   else
    4203              :     {
    4204            0 :       code = is_min ? SMIN : SMAX;
    4205            0 :       if (MEM_P (if_true) && MEM_P (if_false))
    4206            0 :         if_true = force_reg (mode, if_true);
    4207            0 :       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    4208              :     }
    4209              : 
    4210         7627 :   emit_insn (gen_rtx_SET (dest, tmp));
    4211         7627 :   return true;
    4212              : }
    4213              : 
    4214              : /* Return true if MODE is valid for vector compare to mask register,
    4215              :    Same result for conditionl vector move with mask register.  */
    4216              : static bool
    4217        14242 : ix86_valid_mask_cmp_mode (machine_mode mode)
    4218              : {
    4219              :   /* XOP has its own vector conditional movement.  */
    4220        14242 :   if (TARGET_XOP && !TARGET_AVX512F)
    4221              :     return false;
    4222              : 
    4223              :   /* HFmode only supports vcmpsh whose dest is mask register.  */
    4224        14236 :   if (TARGET_AVX512FP16 && mode == HFmode)
    4225              :     return true;
    4226              : 
    4227              :   /* AVX512F is needed for mask operation.  */
    4228        14144 :   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
    4229              :     return false;
    4230              : 
    4231              :   /* AVX512BW is needed for vector QI/HImode,
    4232              :      AVX512VL is needed for 128/256-bit vector.  */
    4233          182 :   machine_mode inner_mode = GET_MODE_INNER (mode);
    4234          182 :   int vector_size = GET_MODE_SIZE (mode);
    4235          182 :   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
    4236              :     return false;
    4237              : 
    4238          162 :   return vector_size == 64 || TARGET_AVX512VL;
    4239              : }
    4240              : 
    4241              : /* Return true if integer mask comparison should be used.  */
    4242              : static bool
    4243        50571 : ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
    4244              :                      rtx op_true, rtx op_false)
    4245              : {
    4246        50571 :   int vector_size = GET_MODE_SIZE (mode);
    4247              : 
    4248        50571 :   if (cmp_mode == HFmode)
    4249              :     return true;
    4250        50479 :   else if (vector_size < 16)
    4251              :     return false;
    4252        44083 :   else if (vector_size == 64)
    4253              :     return true;
    4254        88050 :   else if (GET_MODE_INNER (cmp_mode) == HFmode)
    4255              :     return true;
    4256        88050 :   else if (GET_MODE_INNER (cmp_mode) == BFmode)
    4257              :     return true;
    4258              : 
    4259              :   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
    4260        44025 :   gcc_assert (!op_true == !op_false);
    4261              : 
    4262              :   /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
    4263              :      vector dest is required.  */
    4264        44025 :   if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
    4265              :     return false;
    4266              : 
    4267              :   /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
    4268           48 :   if (op_false == CONST0_RTX (mode)
    4269           48 :       || op_true == CONST0_RTX (mode)
    4270           48 :       || (INTEGRAL_MODE_P (mode)
    4271           40 :           && (op_true == CONSTM1_RTX (mode)
    4272           40 :               || op_false == CONSTM1_RTX (mode))))
    4273            0 :     return false;
    4274              : 
    4275              :   return true;
    4276              : }
    4277              : 
/* Expand an SSE comparison of CODE with operands CMP_OP0/CMP_OP1, whose
   result has mode GET_MODE (DEST).  OP_TRUE/OP_FALSE are the values a
   following cmov-style selection will use (or both NULL) and are only
   consulted to decide between a vector-destination and a mask-destination
   comparison, and to detect overlap with DEST.
   Return the register with the result (a fresh pseudo when DEST cannot
   be used directly).  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      /* Mask result: pick the narrowest integer mode with at least one
	 bit per vector element (QImode minimum).  */
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  /* The first comparison operand must be in a register.  */
  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  /* The second operand may stay in memory if the insn allows it; vector
     modes use the stricter vector_operand predicate.  */
  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  /* Use a fresh pseudo instead of DEST when optimizing (better for later
     passes), when the mask mode differs from MODE, or when DEST overlaps
     one of the selection arms and would be clobbered too early.  */
  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      /* Comparison result mode differs from MODE: materialize it and
	 convert into DEST.  */
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
    4336              : 
    4337              : /* Emit x86 binary operand CODE in mode MODE for SSE vector
    4338              :    instructions that can be performed using GP registers.  */
    4339              : 
    4340              : static void
    4341         7055 : ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
    4342              :                      rtx dst, rtx src1, rtx src2)
    4343              : {
    4344         7055 :   rtx tmp;
    4345              : 
    4346         7055 :   tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
    4347              : 
    4348         7055 :   if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
    4349         7055 :       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    4350              :     {
    4351           94 :       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    4352           94 :       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    4353              :     }
    4354              : 
    4355         7055 :   emit_insn (tmp);
    4356         7055 : }
    4357              : 
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.
   CMP is either a vector comparison result of DEST's width, or (AVX512)
   an integer mask; special-cased constants (0 / all-ones) are reduced to
   single AND/ANDN/IOR operations, otherwise a blend instruction or an
   AND/ANDN/IOR triple is emitted.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  /* Invert the mask so the zero arm becomes the false arm;
	     32-bit targets have no scalar DImode NOT on mask regs,
	     use the knotdi pattern instead.  */
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));
      return;
    }

  /* cmp ? -1 : 0 is simply the comparison result itself.  */
  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      emit_move_insn (dest, cmp);
      return;
    }
  /* cmp ? t : 0  -->  cmp & t.  */
  else if (op_false == CONST0_RTX (mode))
    {
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  /* cmp ? 0 : f  -->  ~cmp & f (single ANDN via ix86_emit_vec_binop).  */
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  /* cmp ? -1 : f  -->  cmp | f.  */
  else if (vector_all_ones_operand (op_true, mode))
    {
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  /* XOP's vpcmov takes the selection directly as an IF_THEN_ELSE.  */
  if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  /* Otherwise select a blend insn for MODE, if one exists.  When only a
     blend of a different (same-size) mode exists, BLEND_MODE records it
     and the operands are viewed through lowpart subregs below.  */
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    /* Integer/half-float 64-bit vectors blend as V8QI bytes.  */
    case E_V8QImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2SImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    /* 128-bit integer vectors blend as V16QI bytes.  */
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
	gen = gen_avx_blendvpd256;
      break;
    /* 256-bit integer vectors blend as V32QI bytes (AVX2 pblendvb).  */
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V16BFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    /* 512-bit modes always have a mask-based blend available.  */
    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V32BFmode:
      gen = gen_avx512bw_blendmv32bf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      /* Blend found: run it in BLEND_MODE, punning operands through
	 lowpart subregs when that differs from MODE.  */
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      /* No blend available: dest = (op_true & cmp) | (op_false & ~cmp).  */
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
    4615              : 
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* DEST equals *POP1: swapping (a no-op for a commutative CODE)
	 puts DEST first.  FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
    4679              : 
    4680              : /* Expand a floating-point conditional move.  Return true if successful.  */
    4681              : 
    4682              : bool
    4683        96053 : ix86_expand_fp_movcc (rtx operands[])
    4684              : {
    4685        96053 :   machine_mode mode = GET_MODE (operands[0]);
    4686        96053 :   enum rtx_code code = GET_CODE (operands[1]);
    4687        96053 :   rtx tmp, compare_op;
    4688        96053 :   rtx op0 = XEXP (operands[1], 0);
    4689        96053 :   rtx op1 = XEXP (operands[1], 1);
    4690              : 
    4691        96053 :   if (GET_MODE (op0) == BFmode
    4692        96053 :       && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    4693              :     return false;
    4694              : 
    4695        96053 :   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
    4696              :     {
    4697        65535 :       machine_mode cmode;
    4698              : 
    4699              :       /* Since we've no cmove for sse registers, don't force bad register
    4700              :          allocation just to gain access to it.  Deny movcc when the
    4701              :          comparison mode doesn't match the move mode.  */
    4702        65535 :       cmode = GET_MODE (op0);
    4703        65535 :       if (cmode == VOIDmode)
    4704            0 :         cmode = GET_MODE (op1);
    4705        65535 :       if (cmode != mode)
    4706              :         return false;
    4707              : 
    4708         9799 :       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
    4709         9799 :       if (code == UNKNOWN)
    4710              :         return false;
    4711              : 
    4712         9779 :       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
    4713              :                                      operands[2], operands[3]))
    4714              :         return true;
    4715              : 
    4716         2152 :       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
    4717              :                                  operands[2], operands[3]);
    4718         2152 :       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
    4719         2152 :       return true;
    4720              :     }
    4721              : 
    4722        30518 :   if (GET_MODE (op0) == TImode
    4723        30518 :       || (GET_MODE (op0) == DImode
    4724           72 :           && !TARGET_64BIT))
    4725              :     return false;
    4726              : 
    4727              :   /* The floating point conditional move instructions don't directly
    4728              :      support conditions resulting from a signed integer comparison.  */
    4729              : 
    4730        30446 :   compare_op = ix86_expand_compare (code, op0, op1);
    4731        30446 :   if (!fcmov_comparison_operator (compare_op, VOIDmode))
    4732              :     {
    4733          146 :       tmp = gen_reg_rtx (QImode);
    4734          146 :       ix86_expand_setcc (tmp, code, op0, op1);
    4735              : 
    4736          146 :       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    4737              :     }
    4738              : 
    4739        30446 :   operands[2] = force_reg (mode, operands[2]);
    4740        30446 :   operands[3] = force_reg (mode, operands[3]);
    4741        30446 :   emit_insn (gen_rtx_SET (operands[0],
    4742              :                           gen_rtx_IF_THEN_ELSE (mode, compare_op,
    4743              :                                                 operands[2], operands[3])));
    4744              : 
    4745        30446 :   return true;
    4746              : }
    4747              : 
    4748              : /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
    4749              : 
    4750              : static int
    4751         4854 : ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
    4752              : {
    4753         4854 :   switch (code)
    4754              :     {
    4755              :     case EQ:
    4756              :       return 0;
    4757          377 :     case LT:
    4758          377 :     case LTU:
    4759          377 :       return 1;
    4760          212 :     case LE:
    4761          212 :     case LEU:
    4762          212 :       return 2;
    4763         3051 :     case NE:
    4764         3051 :       return 4;
    4765          307 :     case GE:
    4766          307 :     case GEU:
    4767          307 :       return 5;
    4768          498 :     case GT:
    4769          498 :     case GTU:
    4770          498 :       return 6;
    4771            0 :     default:
    4772            0 :       gcc_unreachable ();
    4773              :     }
    4774              : }
    4775              : 
    4776              : /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
    4777              : 
    4778              : static int
    4779         1781 : ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
    4780              : {
    4781         1781 :   switch (code)
    4782              :     {
    4783              :     case EQ:
    4784              :       return 0x00;
    4785          354 :     case NE:
    4786          354 :       return 0x04;
    4787          514 :     case GT:
    4788          514 :       return 0x0e;
    4789           88 :     case LE:
    4790           88 :       return 0x02;
    4791           53 :     case GE:
    4792           53 :       return 0x0d;
    4793          620 :     case LT:
    4794          620 :       return 0x01;
    4795            2 :     case UNLE:
    4796            2 :       return 0x0a;
    4797            2 :     case UNLT:
    4798            2 :       return 0x09;
    4799           11 :     case UNGE:
    4800           11 :       return 0x05;
    4801           44 :     case UNGT:
    4802           44 :       return 0x06;
    4803            2 :     case UNEQ:
    4804            2 :       return 0x18;
    4805            0 :     case LTGT:
    4806            0 :       return 0x0c;
    4807            2 :     case ORDERED:
    4808            2 :       return 0x07;
    4809            2 :     case UNORDERED:
    4810            2 :       return 0x03;
    4811            0 :     default:
    4812            0 :       gcc_unreachable ();
    4813              :     }
    4814              : }
    4815              : 
    4816              : /* Return immediate value to be used in UNSPEC_PCMP
    4817              :    for comparison CODE in MODE.  */
    4818              : 
    4819              : static int
    4820         6635 : ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
    4821              : {
    4822         6635 :   if (FLOAT_MODE_P (mode))
    4823         1781 :     return ix86_fp_cmp_code_to_pcmp_immediate (code);
    4824         4854 :   return ix86_int_cmp_code_to_pcmp_immediate (code);
    4825              : }
    4826              : 
    4827              : /* Expand AVX-512 vector comparison.  */
    4828              : 
    4829              : bool
    4830         6635 : ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
    4831              : {
    4832         6635 :   machine_mode mask_mode = GET_MODE (dest);
    4833         6635 :   machine_mode cmp_mode = GET_MODE (cmp_op0);
    4834         6635 :   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
    4835         6635 :   int unspec_code;
    4836         6635 :   rtx unspec;
    4837              : 
    4838         6635 :   switch (code)
    4839              :     {
    4840              :     case LEU:
    4841              :     case GTU:
    4842              :     case GEU:
    4843              :     case LTU:
    4844              :       unspec_code = UNSPEC_UNSIGNED_PCMP;
    4845              :       break;
    4846              : 
    4847         6221 :     default:
    4848         6221 :       unspec_code = UNSPEC_PCMP;
    4849              :     }
    4850              : 
    4851         6635 :   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
    4852              :                            unspec_code);
    4853         6635 :   emit_insn (gen_rtx_SET (dest, unspec));
    4854              : 
    4855         6635 :   return true;
    4856              : }
    4857              : 
    4858              : /* Expand fp vector comparison.  */
    4859              : 
    4860              : bool
    4861         6681 : ix86_expand_fp_vec_cmp (rtx operands[])
    4862              : {
    4863         6681 :   enum rtx_code code = GET_CODE (operands[1]);
    4864         6681 :   rtx cmp;
    4865              : 
    4866         6681 :   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
    4867              :                                            &operands[2], &operands[3]);
    4868         6681 :   if (code == UNKNOWN)
    4869              :     {
    4870           20 :       rtx temp;
    4871           20 :       switch (GET_CODE (operands[1]))
    4872              :         {
    4873            2 :         case LTGT:
    4874            2 :           temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
    4875              :                                       operands[3], NULL, NULL);
    4876            2 :           cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
    4877              :                                      operands[3], NULL, NULL);
    4878            2 :           code = AND;
    4879            2 :           break;
    4880           18 :         case UNEQ:
    4881           18 :           temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
    4882              :                                       operands[3], NULL, NULL);
    4883           18 :           cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
    4884              :                                      operands[3], NULL, NULL);
    4885           18 :           code = IOR;
    4886           18 :           break;
    4887            0 :         default:
    4888            0 :           gcc_unreachable ();
    4889              :         }
    4890           20 :       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
    4891              :                                  OPTAB_DIRECT);
    4892              :     }
    4893              :   else
    4894         6661 :     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
    4895              :                                NULL, NULL);
    4896              : 
    4897         6681 :   if (operands[0] != cmp)
    4898         6598 :     emit_move_insn (operands[0], cmp);
    4899              : 
    4900         6681 :   return true;
    4901              : }
    4902              : 
/* Subroutine of ix86_expand_int_vec_cmp (and, presumably, the vcond
   expanders — confirm against other callers): emit an integer vector
   comparison CODE of COP0 and COP1, producing the result in the mode
   of DEST.  OP_TRUE/OP_FALSE, when non-null, are the values the caller
   intends to select with the comparison; they are consulted when
   deciding whether an AVX-512 mask compare applies and are passed on
   to ix86_expand_sse_cmp.  On return *NEGATE is true when the emitted
   comparison computes the inverse of CODE, in which case the caller
   must negate the result.  Returns NULL if the comparison cannot be
   done (V2DImode EQ without SSE4.1, or GT/GTU without SSE4.2).  */

static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
                         rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  /* Assume no negation is needed until canonicalization proves
     otherwise.  */
  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
        {
        case EQ:
        case GT:
        case GTU:
          break;

        case LE:
        case LEU:
          /* x <= cst can be handled as x < cst + 1 unless there is
             wrap around in cst + 1.  */
          if (CONST_VECTOR_P (cop1)
              && GET_MODE_INNER (mode) != TImode)
            {
              unsigned int n_elts = GET_MODE_NUNITS (mode), i;
              machine_mode eltmode = GET_MODE_INNER (mode);
              for (i = 0; i < n_elts; ++i)
                {
                  rtx elt = CONST_VECTOR_ELT (cop1, i);
                  if (!CONST_INT_P (elt))
                    break;
                  if (code == LE)
                    {
                      /* For LE punt if some element is signed maximum.  */
                      if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
                          == (GET_MODE_MASK (eltmode) >> 1))
                        break;
                    }
                  /* For LEU punt if some element is unsigned maximum.  */
                  else if (elt == constm1_rtx)
                    break;
                }
              /* If no element punted, rewrite as swapped GT/GTU against
                 cst + 1.  */
              if (i == n_elts)
                {
                  rtvec v = rtvec_alloc (n_elts);
                  for (i = 0; i < n_elts; ++i)
                    RTVEC_ELT (v, i)
                      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
                                      eltmode);
                  cop1 = gen_rtx_CONST_VECTOR (mode, v);
                  std::swap (cop0, cop1);
                  code = code == LE ? GT : GTU;
                  break;
                }
            }
          /* FALLTHRU */
        case NE:
          code = reverse_condition (code);
          *negate = true;
          break;

        case GE:
        case GEU:
          /* x >= cst can be handled as x > cst - 1 unless there is
             wrap around in cst - 1.  */
          if (CONST_VECTOR_P (cop1)
              && GET_MODE_INNER (mode) != TImode)
            {
              unsigned int n_elts = GET_MODE_NUNITS (mode), i;
              machine_mode eltmode = GET_MODE_INNER (mode);
              for (i = 0; i < n_elts; ++i)
                {
                  rtx elt = CONST_VECTOR_ELT (cop1, i);
                  if (!CONST_INT_P (elt))
                    break;
                  if (code == GE)
                    {
                      /* For GE punt if some element is signed minimum.  */
                      if (INTVAL (elt) < 0
                          && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
                              == 0))
                        break;
                    }
                  /* For GEU punt if some element is zero.  */
                  else if (elt == const0_rtx)
                    break;
                }
              /* If no element punted, rewrite as GT/GTU against
                 cst - 1.  */
              if (i == n_elts)
                {
                  rtvec v = rtvec_alloc (n_elts);
                  for (i = 0; i < n_elts; ++i)
                    RTVEC_ELT (v, i)
                      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
                                      eltmode);
                  cop1 = gen_rtx_CONST_VECTOR (mode, v);
                  code = code == GE ? GT : GTU;
                  break;
                }
            }
          code = reverse_condition (code);
          *negate = true;
          /* FALLTHRU */

        case LT:
        case LTU:
          std::swap (cop0, cop1);
          code = swap_condition (code);
          break;

        default:
          gcc_unreachable ();
        }

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
        {
          switch (code)
            {
            case EQ:
              /* SSE4.1 supports EQ.  */
              if (!TARGET_SSE4_1)
                return NULL;
              break;

            case GT:
            case GTU:
              /* SSE4.2 supports GT/GTU.  */
              if (!TARGET_SSE4_2)
                return NULL;
              break;

            default:
              gcc_unreachable ();
            }
        }

      if (CONST_VECTOR_P (cop0))
        cop0 = force_reg (mode, cop0);
      else if (CONST_VECTOR_P (cop1))
        cop1 = force_reg (mode, cop1);

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
        std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
         not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
         min (x, y) == x).  While we add one instruction (the minimum),
         we remove the need for two instructions in the negation, as the
         result is done this way.
         When using masks, do it for SI/DImode element types, as it is shorter
         than the two subtractions.  */
      if ((code != EQ
           && GET_MODE_SIZE (mode) != 64
           && vector_all_ones_operand (opfalse, data_mode)
           && optrue == CONST0_RTX (data_mode))
          || (code == GTU
              && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
              /* Don't do it if not using integer masks and we'd end up with
                 the right values in the registers though.  */
              && (GET_MODE_SIZE (mode) == 64
                  || !vector_all_ones_operand (optrue, data_mode)
                  || opfalse != CONST0_RTX (data_mode))))
        {
          rtx (*gen) (rtx, rtx, rtx) = NULL;

          /* Pick a [u]min expander matching the vector mode and the
             available ISA; GEN stays NULL when there is none and the
             transformation is skipped.  */
          switch (mode)
            {
            case E_V16SImode:
              gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
              break;
            case E_V8DImode:
              gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
              cop0 = force_reg (mode, cop0);
              cop1 = force_reg (mode, cop1);
              break;
            case E_V32QImode:
              if (TARGET_AVX2)
                gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
              break;
            case E_V16HImode:
              if (TARGET_AVX2)
                gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
              break;
            case E_V8SImode:
              if (TARGET_AVX2)
                gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
              break;
            case E_V4DImode:
              if (TARGET_AVX512VL)
                {
                  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
                  cop0 = force_reg (mode, cop0);
                  cop1 = force_reg (mode, cop1);
                }
              break;
            case E_V16QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv16qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv16qi3;
              break;
            case E_V8QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv8qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv8qi3;
              break;
            case E_V4QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv4qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv4qi3;
              break;
            case E_V2QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv2qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv2qi3;
              break;
            case E_V8HImode:
              if (code == GTU && TARGET_SSE4_1)
                gen = gen_uminv8hi3;
              else if (code == GT && TARGET_SSE2)
                gen = gen_sminv8hi3;
              break;
            case E_V4HImode:
              if (code == GTU && TARGET_SSE4_1)
                gen = gen_uminv4hi3;
              else if (code == GT && TARGET_SSE2)
                gen = gen_sminv4hi3;
              break;
            case E_V2HImode:
              if (code == GTU && TARGET_SSE4_1)
                gen = gen_uminv2hi3;
              else if (code == GT && TARGET_SSE2)
                gen = gen_sminv2hi3;
              break;
            case E_V4SImode:
              if (TARGET_SSE4_1)
                gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
              break;
            case E_V2SImode:
              if (TARGET_SSE4_1)
                gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
              break;
            case E_V2DImode:
              if (TARGET_AVX512VL)
                {
                  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
                  cop0 = force_reg (mode, cop0);
                  cop1 = force_reg (mode, cop1);
                }
              break;
            default:
              break;
            }

          if (gen)
            {
              rtx tem = gen_reg_rtx (mode);
              if (!vector_operand (cop0, mode))
                cop0 = force_reg (mode, cop0);
              if (!vector_operand (cop1, mode))
                cop1 = force_reg (mode, cop1);
              /* min (x, y) == x is the inverse of the canonicalized
                 comparison, hence flip *NEGATE.  */
              *negate = !*negate;
              emit_insn (gen (tem, cop0, cop1));
              cop1 = tem;
              code = EQ;
            }
        }

      /* Unsigned parallel compare is not supported by the hardware.
         Play some tricks to turn this into a signed comparison
         against 0.  */
      if (code == GTU)
        {
          cop0 = force_reg (mode, cop0);

          switch (mode)
            {
            case E_V16SImode:
            case E_V8DImode:
            case E_V8SImode:
            case E_V4DImode:
            case E_V4SImode:
            case E_V2SImode:
            case E_V2DImode:
                {
                  rtx t1, t2, mask;

                  /* Subtract (-(INT MAX) - 1) from both operands to make
                     them signed.  */
                  mask = ix86_build_signbit_mask (mode, true, false);
                  t1 = gen_reg_rtx (mode);
                  emit_insn (gen_sub3_insn (t1, cop0, mask));

                  t2 = gen_reg_rtx (mode);
                  emit_insn (gen_sub3_insn (t2, cop1, mask));

                  cop0 = t1;
                  cop1 = t2;
                  code = GT;
                }
              break;

            case E_V64QImode:
            case E_V32HImode:
            case E_V32QImode:
            case E_V16HImode:
            case E_V16QImode:
            case E_V8QImode:
            case E_V4QImode:
            case E_V2QImode:
            case E_V8HImode:
            case E_V4HImode:
            case E_V2HImode:
              /* Perform a parallel unsigned saturating subtraction.  */
              x = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET
                         (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
              cop0 = x;
              cop1 = CONST0_RTX (mode);
              code = EQ;
              *negate = !*negate;
              break;

            default:
              gcc_unreachable ();
            }
        }
    }

  /* If we ended up computing the inverse comparison, the movcc arms
     must be swapped as well.  */
  if (*negate)
    std::swap (op_true, op_false);

  if (CONST_VECTOR_P (cop1))
    cop1 = force_reg (mode, cop1);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
                               op_true, op_false);
      if (GET_MODE (x) == mode)
        x = gen_lowpart (data_mode, x);
    }

  return x;
}
    5270              : 
    5271              : /* Expand integer vector comparison.  */
    5272              : 
    5273              : bool
    5274         9592 : ix86_expand_int_vec_cmp (rtx operands[])
    5275              : {
    5276         9592 :   rtx_code code = GET_CODE (operands[1]);
    5277         9592 :   bool negate = false;
    5278         9592 :   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
    5279              :                                      operands[3], NULL, NULL, &negate);
    5280              : 
    5281         9592 :   if (!cmp)
    5282              :     return false;
    5283              : 
    5284         9592 :   if (negate)
    5285         3716 :     cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
    5286         3716 :                                    CONST0_RTX (GET_MODE (cmp)),
    5287              :                                    NULL, NULL, &negate);
    5288              : 
    5289         9592 :   gcc_assert (!negate);
    5290              : 
    5291         9592 :   if (operands[0] != cmp)
    5292         9298 :     emit_move_insn (operands[0], cmp);
    5293              : 
    5294              :   return true;
    5295              : }
    5296              : 
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  /* Canonicalize the comparison into a form the SSE compare insns can
     encode; this may swap operands[4]/operands[5] and returns UNKNOWN
     when no single compare instruction implements CODE.  */
  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      /* LTGT and UNEQ have no single-instruction encoding; build them
	 from two compares joined with a bitwise operation:
	   LTGT = NE & ORDERED,  UNEQ = EQ | UNORDERED.  */
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Combine the two partial masks, then select between the arms.  */
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  /* Some comparisons against the selected arms reduce to a min/max
     instruction; try that before emitting a compare + blend.  */
  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
    5345              : 
/* Expand a signed/unsigned integral vector conditional move.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.
     operands[1 + (code == LT)] picks whichever arm must be zero for
     the trick to apply (the false arm for LT, the true arm for GE).  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      /* NEGOP is the arm selected when the element is negative.  */
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  /* x < 0 ? 1 : 0 — logical shift brings the sign bit down.  */
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  /* x < 0 ? -1 : 0 — arithmetic shift replicates the sign bit.
	     DImode is excluded; presumably no variable per-element
	     arithmetic shift exists for it here — TODO confirm.  */
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  /* Force the operands into forms the comparison expander accepts.  */
  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  /* If the compare was emitted negated, swap the arms of the select
     instead of inverting the mask.  */
  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
    5409              : 
/* Try to expand a (possibly two-operand) vector permutation with a
   single AVX-512 vpermt2var instruction.  TARGET/MASK/OP0/OP1 describe
   a variable permutation, or D a constant one (see below).  Returns
   true on success (or, for D->testing_p, when the expansion would
   succeed), false when no suitable instruction is available.  */

static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  /* Pick the vpermt2var pattern for MODE, gated on the ISA extension
     that provides it: AVX512F for 512-bit D/Q/PS/PD, AVX512VL for the
     narrower widths, plus AVX512BW for 16-bit and AVX512VBMI for 8-bit
     elements.  Floating-point modes index with the same-shaped integer
     mode, recorded in MASKMODE.  */
  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  /* No vpermt2var instruction for this mode/ISA combination.  */
  if (gen == NULL)
    return false;

  /* The const expander may only be probing for feasibility.  */
  if (d && d->testing_p)
    return true;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      /* Materialize the constant permutation D->perm as a CONST_VECTOR
	 index operand in MASKMODE.  */
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
    5538              : 
    5539              : /* Expand a variable vector permutation.  */
    5540              : 
    5541              : void
    5542           10 : ix86_expand_vec_perm (rtx operands[])
    5543              : {
    5544           10 :   rtx target = operands[0];
    5545           10 :   rtx op0 = operands[1];
    5546           10 :   rtx op1 = operands[2];
    5547           10 :   rtx mask = operands[3];
    5548           10 :   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
    5549           10 :   machine_mode mode = GET_MODE (op0);
    5550           10 :   machine_mode maskmode = GET_MODE (mask);
    5551           10 :   int w, e, i;
    5552           10 :   bool one_operand_shuffle = rtx_equal_p (op0, op1);
    5553              : 
    5554              :   /* Number of elements in the vector.  */
    5555           10 :   w = GET_MODE_NUNITS (mode);
    5556           10 :   e = GET_MODE_UNIT_SIZE (mode);
    5557           10 :   gcc_assert (w <= 64);
    5558              : 
    5559              :   /* For HF mode vector, convert it to HI using subreg.  */
    5560           20 :   if (GET_MODE_INNER (mode) == HFmode)
    5561              :     {
    5562            6 :       machine_mode orig_mode = mode;
    5563            6 :       mode = mode_for_vector (HImode, w).require ();
    5564            6 :       target = lowpart_subreg (mode, target, orig_mode);
    5565            6 :       op0 = lowpart_subreg (mode, op0, orig_mode);
    5566            6 :       op1 = lowpart_subreg (mode, op1, orig_mode);
    5567              :     }
    5568              : 
    5569           10 :   if (TARGET_AVX512F && one_operand_shuffle)
    5570              :     {
    5571            5 :       rtx (*gen) (rtx, rtx, rtx) = NULL;
    5572            5 :       switch (mode)
    5573              :         {
    5574              :         case E_V16SImode:
    5575              :           gen =gen_avx512f_permvarv16si;
    5576              :           break;
    5577            0 :         case E_V16SFmode:
    5578            0 :           gen = gen_avx512f_permvarv16sf;
    5579            0 :           break;
    5580            0 :         case E_V8DImode:
    5581            0 :           gen = gen_avx512f_permvarv8di;
    5582            0 :           break;
    5583            0 :         case E_V8DFmode:
    5584            0 :           gen = gen_avx512f_permvarv8df;
    5585            0 :           break;
    5586              :         default:
    5587              :           break;
    5588              :         }
    5589            0 :       if (gen != NULL)
    5590              :         {
    5591            0 :           emit_insn (gen (target, op0, mask));
    5592            8 :           return;
    5593              :         }
    5594              :     }
    5595              : 
    5596           10 :   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    5597              :     return;
    5598              : 
    5599            2 :   if (TARGET_AVX2)
    5600              :     {
    5601            1 :       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
    5602              :         {
    5603              :           /* Unfortunately, the VPERMQ and VPERMPD instructions only support
    5604              :              an constant shuffle operand.  With a tiny bit of effort we can
    5605              :              use VPERMD instead.  A re-interpretation stall for V4DFmode is
    5606              :              unfortunate but there's no avoiding it.
    5607              :              Similarly for V16HImode we don't have instructions for variable
    5608              :              shuffling, while for V32QImode we can use after preparing suitable
    5609              :              masks vpshufb; vpshufb; vpermq; vpor.  */
    5610              : 
    5611              :           if (mode == V16HImode)
    5612              :             {
    5613              :               maskmode = mode = V32QImode;
    5614              :               w = 32;
    5615              :               e = 1;
    5616              :             }
    5617              :           else
    5618              :             {
    5619              :               maskmode = mode = V8SImode;
    5620              :               w = 8;
    5621              :               e = 4;
    5622              :             }
    5623            0 :           t1 = gen_reg_rtx (maskmode);
    5624              : 
    5625              :           /* Replicate the low bits of the V4DImode mask into V8SImode:
    5626              :                mask = { A B C D }
    5627              :                t1 = { A A B B C C D D }.  */
    5628            0 :           for (i = 0; i < w / 2; ++i)
    5629            0 :             vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
    5630            0 :           vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
    5631            0 :           vt = force_reg (maskmode, vt);
    5632            0 :           mask = gen_lowpart (maskmode, mask);
    5633            0 :           if (maskmode == V8SImode)
    5634            0 :             emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
    5635              :           else
    5636            0 :             emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
    5637              : 
    5638              :           /* Multiply the shuffle indicies by two.  */
    5639            0 :           t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
    5640              :                                     OPTAB_DIRECT);
    5641              : 
    5642              :           /* Add one to the odd shuffle indicies:
    5643              :                 t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
    5644            0 :           for (i = 0; i < w / 2; ++i)
    5645              :             {
    5646            0 :               vec[i * 2] = const0_rtx;
    5647            0 :               vec[i * 2 + 1] = const1_rtx;
    5648              :             }
    5649            0 :           vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
    5650            0 :           vt = validize_mem (force_const_mem (maskmode, vt));
    5651            0 :           t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
    5652              :                                     OPTAB_DIRECT);
    5653              : 
    5654              :           /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
    5655            0 :           operands[3] = mask = t1;
    5656            0 :           target = gen_reg_rtx (mode);
    5657            0 :           op0 = gen_lowpart (mode, op0);
    5658            0 :           op1 = gen_lowpart (mode, op1);
    5659              :         }
    5660              : 
    5661            1 :       switch (mode)
    5662              :         {
    5663            1 :         case E_V8SImode:
    5664              :           /* The VPERMD and VPERMPS instructions already properly ignore
    5665              :              the high bits of the shuffle elements.  No need for us to
    5666              :              perform an AND ourselves.  */
    5667            1 :           if (one_operand_shuffle)
    5668              :             {
    5669            0 :               emit_insn (gen_avx2_permvarv8si (target, op0, mask));
    5670            0 :               if (target != operands[0])
    5671            0 :                 emit_move_insn (operands[0],
    5672            0 :                                 gen_lowpart (GET_MODE (operands[0]), target));
    5673              :             }
    5674              :           else
    5675              :             {
    5676            1 :               t1 = gen_reg_rtx (V8SImode);
    5677            1 :               t2 = gen_reg_rtx (V8SImode);
    5678            1 :               emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
    5679            1 :               emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
    5680            1 :               goto merge_two;
    5681              :             }
    5682            0 :           return;
    5683              : 
    5684            0 :         case E_V8SFmode:
    5685            0 :           mask = gen_lowpart (V8SImode, mask);
    5686            0 :           if (one_operand_shuffle)
    5687            0 :             emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
    5688              :           else
    5689              :             {
    5690            0 :               t1 = gen_reg_rtx (V8SFmode);
    5691            0 :               t2 = gen_reg_rtx (V8SFmode);
    5692            0 :               emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
    5693            0 :               emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
    5694            0 :               goto merge_two;
    5695              :             }
    5696            0 :           return;
    5697              : 
    5698            0 :         case E_V4SImode:
    5699              :           /* By combining the two 128-bit input vectors into one 256-bit
    5700              :              input vector, we can use VPERMD and VPERMPS for the full
    5701              :              two-operand shuffle.  */
    5702            0 :           t1 = gen_reg_rtx (V8SImode);
    5703            0 :           t2 = gen_reg_rtx (V8SImode);
    5704            0 :           emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
    5705            0 :           emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
    5706            0 :           emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
    5707            0 :           emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
    5708            0 :           return;
    5709              : 
    5710            0 :         case E_V4SFmode:
    5711            0 :           t1 = gen_reg_rtx (V8SFmode);
    5712            0 :           t2 = gen_reg_rtx (V8SImode);
    5713            0 :           mask = gen_lowpart (V4SImode, mask);
    5714            0 :           emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
    5715            0 :           emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
    5716            0 :           emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
    5717            0 :           emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
    5718            0 :           return;
    5719              : 
    5720            0 :         case E_V32QImode:
    5721            0 :           t1 = gen_reg_rtx (V32QImode);
    5722            0 :           t2 = gen_reg_rtx (V32QImode);
    5723            0 :           t3 = gen_reg_rtx (V32QImode);
    5724            0 :           vt2 = GEN_INT (-128);
    5725            0 :           vt = gen_const_vec_duplicate (V32QImode, vt2);
    5726            0 :           vt = force_reg (V32QImode, vt);
    5727            0 :           for (i = 0; i < 32; i++)
    5728            0 :             vec[i] = i < 16 ? vt2 : const0_rtx;
    5729            0 :           vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
    5730            0 :           vt2 = force_reg (V32QImode, vt2);
    5731              :           /* From mask create two adjusted masks, which contain the same
    5732              :              bits as mask in the low 7 bits of each vector element.
    5733              :              The first mask will have the most significant bit clear
    5734              :              if it requests element from the same 128-bit lane
    5735              :              and MSB set if it requests element from the other 128-bit lane.
    5736              :              The second mask will have the opposite values of the MSB,
    5737              :              and additionally will have its 128-bit lanes swapped.
    5738              :              E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
    5739              :              t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
    5740              :              t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
    5741              :              stands for other 12 bytes.  */
    5742              :           /* The bit whether element is from the same lane or the other
    5743              :              lane is bit 4, so shift it up by 3 to the MSB position.  */
    5744            0 :           t5 = gen_reg_rtx (V4DImode);
    5745            0 :           emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
    5746              :                                     GEN_INT (3)));
    5747              :           /* Clear MSB bits from the mask just in case it had them set.  */
    5748            0 :           emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
    5749              :           /* After this t1 will have MSB set for elements from other lane.  */
    5750            0 :           emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
    5751              :           /* Clear bits other than MSB.  */
    5752            0 :           emit_insn (gen_andv32qi3 (t1, t1, vt));
    5753              :           /* Or in the lower bits from mask into t3.  */
    5754            0 :           emit_insn (gen_iorv32qi3 (t3, t1, t2));
    5755              :           /* And invert MSB bits in t1, so MSB is set for elements from the same
    5756              :              lane.  */
    5757            0 :           emit_insn (gen_xorv32qi3 (t1, t1, vt));
    5758              :           /* Swap 128-bit lanes in t3.  */
    5759            0 :           t6 = gen_reg_rtx (V4DImode);
    5760            0 :           emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
    5761              :                                           const2_rtx, GEN_INT (3),
    5762              :                                           const0_rtx, const1_rtx));
    5763              :           /* And or in the lower bits from mask into t1.  */
    5764            0 :           emit_insn (gen_iorv32qi3 (t1, t1, t2));
    5765            0 :           if (one_operand_shuffle)
    5766              :             {
    5767              :               /* Each of these shuffles will put 0s in places where
    5768              :                  element from the other 128-bit lane is needed, otherwise
    5769              :                  will shuffle in the requested value.  */
    5770            0 :               emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
    5771            0 :                                                 gen_lowpart (V32QImode, t6)));
    5772            0 :               emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
    5773              :               /* For t3 the 128-bit lanes are swapped again.  */
    5774            0 :               t7 = gen_reg_rtx (V4DImode);
    5775            0 :               emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
    5776              :                                               const2_rtx, GEN_INT (3),
    5777              :                                               const0_rtx, const1_rtx));
    5778              :               /* And oring both together leads to the result.  */
    5779            0 :               emit_insn (gen_iorv32qi3 (target, t1,
    5780            0 :                                         gen_lowpart (V32QImode, t7)));
    5781            0 :               if (target != operands[0])
    5782            0 :                 emit_move_insn (operands[0],
    5783            0 :                                 gen_lowpart (GET_MODE (operands[0]), target));
    5784            0 :               return;
    5785              :             }
    5786              : 
    5787            0 :           t4 = gen_reg_rtx (V32QImode);
    5788              :           /* Similarly to the above one_operand_shuffle code,
    5789              :              just for repeated twice for each operand.  merge_two:
    5790              :              code will merge the two results together.  */
    5791            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
    5792            0 :                                             gen_lowpart (V32QImode, t6)));
    5793            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
    5794            0 :                                             gen_lowpart (V32QImode, t6)));
    5795            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
    5796            0 :           emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
    5797            0 :           t7 = gen_reg_rtx (V4DImode);
    5798            0 :           emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
    5799              :                                           const2_rtx, GEN_INT (3),
    5800              :                                           const0_rtx, const1_rtx));
    5801            0 :           t8 = gen_reg_rtx (V4DImode);
    5802            0 :           emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
    5803              :                                           const2_rtx, GEN_INT (3),
    5804              :                                           const0_rtx, const1_rtx));
    5805            0 :           emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
    5806            0 :           emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
    5807            0 :           t1 = t4;
    5808            0 :           t2 = t3;
    5809            0 :           goto merge_two;
    5810              : 
    5811            0 :         default:
    5812            0 :           gcc_assert (GET_MODE_SIZE (mode) <= 16);
    5813              :           break;
    5814              :         }
    5815              :     }
    5816              : 
    5817            1 :   if (TARGET_XOP)
    5818              :     {
    5819              :       /* The XOP VPPERM insn supports three inputs.  By ignoring the
    5820              :          one_operand_shuffle special case, we avoid creating another
    5821              :          set of constant vectors in memory.  */
    5822            0 :       one_operand_shuffle = false;
    5823              : 
    5824              :       /* mask = mask & {2*w-1, ...} */
    5825            0 :       vt = GEN_INT (2*w - 1);
    5826              :     }
    5827              :   else
    5828              :     {
    5829              :       /* mask = mask & {w-1, ...} */
    5830            1 :       vt = GEN_INT (w - 1);
    5831              :     }
    5832              : 
    5833            1 :   vt = gen_const_vec_duplicate (maskmode, vt);
    5834            1 :   mask = expand_simple_binop (maskmode, AND, mask, vt,
    5835              :                               NULL_RTX, 0, OPTAB_DIRECT);
    5836              : 
    5837              :   /* For non-QImode operations, convert the word permutation control
    5838              :      into a byte permutation control.  */
    5839            1 :   if (mode != V16QImode)
    5840              :     {
    5841            1 :       mask = expand_simple_binop (maskmode, ASHIFT, mask,
    5842            2 :                                   GEN_INT (exact_log2 (e)),
    5843              :                                   NULL_RTX, 0, OPTAB_DIRECT);
    5844              : 
    5845              :       /* Convert mask to vector of chars.  */
    5846            1 :       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
    5847              : 
    5848              :       /* Replicate each of the input bytes into byte positions:
    5849              :          (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
    5850              :          (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
    5851              :          (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
    5852           18 :       for (i = 0; i < 16; ++i)
    5853           16 :         vec[i] = GEN_INT (i/e * e);
    5854            1 :       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
    5855            1 :       vt = validize_mem (force_const_mem (V16QImode, vt));
    5856            1 :       if (TARGET_XOP)
    5857            0 :         emit_insn (gen_xop_pperm (mask, mask, mask, vt));
    5858              :       else
    5859            1 :         emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
    5860              : 
    5861              :       /* Convert it into the byte positions by doing
    5862              :          mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
    5863           17 :       for (i = 0; i < 16; ++i)
    5864           16 :         vec[i] = GEN_INT (i % e);
    5865            1 :       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
    5866            1 :       vt = validize_mem (force_const_mem (V16QImode, vt));
    5867            1 :       emit_insn (gen_addv16qi3 (mask, mask, vt));
    5868              :     }
    5869              : 
    5870              :   /* The actual shuffle operations all operate on V16QImode.  */
    5871            1 :   op0 = gen_lowpart (V16QImode, op0);
    5872            1 :   op1 = gen_lowpart (V16QImode, op1);
    5873              : 
    5874            1 :   if (TARGET_XOP)
    5875              :     {
    5876            0 :       if (GET_MODE (target) != V16QImode)
    5877            0 :         target = gen_reg_rtx (V16QImode);
    5878            0 :       emit_insn (gen_xop_pperm (target, op0, op1, mask));
    5879            0 :       if (target != operands[0])
    5880            0 :         emit_move_insn (operands[0],
    5881            0 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5882              :     }
    5883            1 :   else if (one_operand_shuffle)
    5884              :     {
    5885            1 :       if (GET_MODE (target) != V16QImode)
    5886            1 :         target = gen_reg_rtx (V16QImode);
    5887            1 :       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
    5888            1 :       if (target != operands[0])
    5889            1 :         emit_move_insn (operands[0],
    5890            1 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5891              :     }
    5892              :   else
    5893              :     {
    5894            0 :       rtx xops[6];
    5895            0 :       bool ok;
    5896              : 
    5897              :       /* Shuffle the two input vectors independently.  */
    5898            0 :       t1 = gen_reg_rtx (V16QImode);
    5899            0 :       t2 = gen_reg_rtx (V16QImode);
    5900            0 :       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
    5901            0 :       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
    5902              : 
    5903            1 :  merge_two:
    5904              :       /* Then merge them together.  The key is whether any given control
    5905              :          element contained a bit set that indicates the second word.  */
    5906            1 :       mask = operands[3];
    5907            1 :       vt = GEN_INT (w);
    5908            1 :       if (maskmode == V2DImode && !TARGET_SSE4_1)
    5909              :         {
    5910              :           /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
    5911              :              more shuffle to convert the V2DI input mask into a V4SI
    5912              :              input mask.  At which point the masking that expand_int_vcond
    5913              :              will work as desired.  */
    5914            0 :           rtx t3 = gen_reg_rtx (V4SImode);
    5915            0 :           emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
    5916              :                                         const0_rtx, const0_rtx,
    5917              :                                         const2_rtx, const2_rtx));
    5918            0 :           mask = t3;
    5919            0 :           maskmode = V4SImode;
    5920            0 :           e = w = 4;
    5921              :         }
    5922              : 
    5923            1 :       vt = gen_const_vec_duplicate (maskmode, vt);
    5924            1 :       vt = force_reg (maskmode, vt);
    5925            1 :       mask = expand_simple_binop (maskmode, AND, mask, vt,
    5926              :                                   NULL_RTX, 0, OPTAB_DIRECT);
    5927              : 
    5928            1 :       if (GET_MODE (target) != mode)
    5929            0 :         target = gen_reg_rtx (mode);
    5930            1 :       xops[0] = target;
    5931            1 :       xops[1] = gen_lowpart (mode, t2);
    5932            1 :       xops[2] = gen_lowpart (mode, t1);
    5933            1 :       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
    5934            1 :       xops[4] = mask;
    5935            1 :       xops[5] = vt;
    5936            1 :       ok = ix86_expand_int_vcond (xops);
    5937            1 :       gcc_assert (ok);
    5938            1 :       if (target != operands[0])
    5939            0 :         emit_move_insn (operands[0],
    5940            0 :                         gen_lowpart (GET_MODE (operands[0]), target));
    5941              :     }
    5942              : }
    5943              : 
/* Extend SRC into next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.
   DEST receives the widened vector; SRC must have one of the
   small (MMX-sized or narrower) integer vector modes below.  */

void
ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
{
  machine_mode imode = GET_MODE (src);
  rtx ops[3];

  /* Only these narrow integer vector modes are supported; any other
     input mode is a caller error.  */
  switch (imode)
    {
    case E_V8QImode:
    case E_V4QImode:
    case E_V2QImode:
    case E_V4HImode:
    case E_V2HImode:
    case E_V2SImode:
      break;
    default:
      gcc_unreachable ();
    }

  /* ops[] is the operand array consumed by ix86_split_mmx_punpck:
     ops[0] = destination, ops[1] = low input, ops[2] = high input.  */
  ops[0] = dest;

  ops[1] = force_reg (imode, src);

  if (unsigned_p)
    /* Zero extension: interleave SRC with a zero vector.  */
    ops[2] = force_reg (imode, CONST0_RTX (imode));
  else
    /* Sign extension: interleave SRC with its sign mask, i.e. a vector
       whose elements are all-ones where (0 > src) and zero elsewhere.  */
    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
                                  ops[1], pc_rtx, pc_rtx);

  /* Interleave the low elements of ops[1]/ops[2] into DEST (the
     'false' argument selects the low half).  */
  ix86_split_mmx_punpck (ops, false);
}
    5978              : 
/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      /* SSE4.1 and later provide direct extension insns
         (pmovzx/pmovsx and the AVX/AVX-512 equivalents), which only
         read the low half of their input.  Select the extender here;
         for 256-bit and wider inputs also select an extractor for the
         requested half.  */
      rtx (*unpack)(rtx, rtx);
      rtx (*extract)(rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      switch (imode)
        {
        case E_V64QImode:
          if (unsigned_p)
            unpack = gen_avx512bw_zero_extendv32qiv32hi2;
          else
            unpack = gen_avx512bw_sign_extendv32qiv32hi2;
          halfmode = V32QImode;
          extract
            = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
          break;
        case E_V32QImode:
          if (unsigned_p)
            unpack = gen_avx2_zero_extendv16qiv16hi2;
          else
            unpack = gen_avx2_sign_extendv16qiv16hi2;
          halfmode = V16QImode;
          extract
            = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
          break;
        case E_V32HImode:
          if (unsigned_p)
            unpack = gen_avx512f_zero_extendv16hiv16si2;
          else
            unpack = gen_avx512f_sign_extendv16hiv16si2;
          halfmode = V16HImode;
          extract
            = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
          break;
        case E_V16HImode:
          if (unsigned_p)
            unpack = gen_avx2_zero_extendv8hiv8si2;
          else
            unpack = gen_avx2_sign_extendv8hiv8si2;
          halfmode = V8HImode;
          extract
            = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
          break;
        case E_V16SImode:
          if (unsigned_p)
            unpack = gen_avx512f_zero_extendv8siv8di2;
          else
            unpack = gen_avx512f_sign_extendv8siv8di2;
          halfmode = V8SImode;
          extract
            = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
          break;
        case E_V8SImode:
          if (unsigned_p)
            unpack = gen_avx2_zero_extendv4siv4di2;
          else
            unpack = gen_avx2_sign_extendv4siv4di2;
          halfmode = V4SImode;
          extract
            = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
          break;
        case E_V16QImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv8qiv8hi2;
          else
            unpack = gen_sse4_1_sign_extendv8qiv8hi2;
          break;
        case E_V8HImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv4hiv4si2;
          else
            unpack = gen_sse4_1_sign_extendv4hiv4si2;
          break;
        case E_V4SImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv2siv2di2;
          else
            unpack = gen_sse4_1_sign_extendv2siv2di2;
          break;
        case E_V8QImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv4qiv4hi2;
          else
            unpack = gen_sse4_1_sign_extendv4qiv4hi2;
          break;
        case E_V4HImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv2hiv2si2;
          else
            unpack = gen_sse4_1_sign_extendv2hiv2si2;
          break;
        case E_V4QImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv2qiv2hi2;
          else
            unpack = gen_sse4_1_sign_extendv2qiv2hi2;
          break;
        default:
          gcc_unreachable ();
        }

      if (GET_MODE_SIZE (imode) >= 32)
        {
          /* 256-bit or wider input: extract the requested half into a
             half-width register and extend that.  */
          tmp = gen_reg_rtx (halfmode);
          emit_insn (extract (tmp, src));
        }
      else if (high_p)
        {
          /* The extension insns read the low elements, so for HIGH_P
             shift the upper half of the vector down into the low half
             first, using a whole-register logical right shift.  */
          switch (GET_MODE_SIZE (imode))
            {
            case 16:
              /* Shift higher 8 bytes to lower 8 bytes.  */
              tmp = gen_reg_rtx (V1TImode);
              emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
                                             GEN_INT (64)));
              break;
            case 8:
              /* Shift higher 4 bytes to lower 4 bytes.  */
              tmp = gen_reg_rtx (V1DImode);
              emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
                                            GEN_INT (32)));
              break;
            case 4:
              /* Shift higher 2 bytes to lower 2 bytes.  */
              tmp = gen_reg_rtx (V1SImode);
              emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
                                            GEN_INT (16)));
              break;
            default:
              gcc_unreachable ();
            }

          tmp = gen_lowpart (imode, tmp);
        }
      else
        tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      /* Without SSE4.1, widen by interleaving SRC with either zero
         (zero extension) or its sign mask (sign extension) using the
         punpckh/punpckl family.  */
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
        {
        case E_V16QImode:
          if (high_p)
            unpack = gen_vec_interleave_highv16qi;
          else
            unpack = gen_vec_interleave_lowv16qi;
          break;
        case E_V8HImode:
          if (high_p)
            unpack = gen_vec_interleave_highv8hi;
          else
            unpack = gen_vec_interleave_lowv8hi;
          break;
        case E_V4SImode:
          if (high_p)
            unpack = gen_vec_interleave_highv4si;
          else
            unpack = gen_vec_interleave_lowv4si;
          break;
        case E_V8QImode:
          if (high_p)
            unpack = gen_mmx_punpckhbw;
          else
            unpack = gen_mmx_punpcklbw;
          break;
        case E_V4HImode:
          if (high_p)
            unpack = gen_mmx_punpckhwd;
          else
            unpack = gen_mmx_punpcklwd;
          break;
        case E_V4QImode:
          if (high_p)
            unpack = gen_mmx_punpckhbw_low;
          else
            unpack = gen_mmx_punpcklbw_low;
          break;
        default:
          gcc_unreachable ();
        }

      if (unsigned_p)
        /* Zero extension: interleave with zero.  */
        tmp = force_reg (imode, CONST0_RTX (imode));
      else
        /* Sign extension: interleave with the sign mask (0 > src).  */
        tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
                                   src, pc_rtx, pc_rtx);

      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      /* The interleave result is in IMODE; pun it to DEST's mode.  */
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
    6186              : 
    6187              : /* Return true if mem is pool constant which contains a const_vector
    6188              :    perm index, assign the index to PERM.  */
    6189              : bool
    6190           35 : ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
    6191              : {
    6192           35 :   machine_mode mode = GET_MODE (mem);
    6193           35 :   int nelt = GET_MODE_NUNITS (mode);
    6194              : 
    6195           35 :   if (!INTEGRAL_MODE_P (mode))
    6196              :     return false;
    6197              : 
    6198              :     /* Needs to be constant pool.  */
    6199           35 :   if (!(MEM_P (mem))
    6200           35 :       || !SYMBOL_REF_P (XEXP (mem, 0))
    6201           70 :       || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
    6202              :    return false;
    6203              : 
    6204           35 :   rtx constant = get_pool_constant (XEXP (mem, 0));
    6205              : 
    6206           35 :   if (!CONST_VECTOR_P (constant))
    6207              :     return false;
    6208              : 
    6209              :   /* There could be some rtx like
    6210              :      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
    6211              :      but with "*.LC1" refer to V2DI constant vector.  */
    6212           35 :   if (GET_MODE (constant) != mode)
    6213              :     {
    6214            0 :       constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
    6215              : 
    6216            0 :       if (constant == nullptr || !CONST_VECTOR_P (constant))
    6217              :         return false;
    6218              :     }
    6219              : 
    6220          771 :   for (int i = 0; i != nelt; i++)
    6221          736 :     perm[i] = UINTVAL (XVECEXP (constant, 0, i));
    6222              : 
    6223              :   return true;
    6224              : }
    6225              : 
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating pointer parameters and nonoffsetable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.

   OPERAND is the value to split, PARTS receives the word-sized pieces
   (low part first), MODE is OPERAND's mode.  Returns the number of
   parts produced (2..4).  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  /* Part count in words: 32-bit words for !TARGET_64BIT (XFmode is
     three of them), 64-bit words for TARGET_64BIT (rounding up so
     XFmode's 80 bits count as two).  */
  if (!TARGET_64BIT)
    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsetable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      /* For pushes every part is the same word-mode stack reference;
         the caller emits them in the right order.  */
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (CONST_VECTOR_P (operand))
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
         the operand may actually have a different mode now.  That's
         ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
        {
          int i;

          if (REG_P (operand))
            {
              /* Hard registers only exist after reload; the parts are
                 the consecutive SImode registers starting at OPERAND.  */
              gcc_assert (reload_completed);
              for (i = 0; i < size; i++)
                parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
            }
          else if (offsettable_memref_p (operand))
            {
              /* Slice the memory into SImode pieces at 4-byte offsets.  */
              operand = adjust_address (operand, SImode, 0);
              parts[0] = operand;
              for (i = 1; i < size; i++)
                parts[i] = adjust_address (operand, SImode, 4 * i);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              /* Decompose an FP constant into 32-bit immediates via its
                 target representation.  */
              const REAL_VALUE_TYPE *r;
              long l[4];

              r = CONST_DOUBLE_REAL_VALUE (operand);
              switch (mode)
                {
                case E_TFmode:
                  real_to_target (l, r, mode);
                  parts[3] = gen_int_mode (l[3], SImode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_XFmode:
                  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
                     long double may not be 80-bit.  */
                  real_to_target (l, r, mode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_DFmode:
                  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
                  break;
                default:
                  gcc_unreachable ();
                }
              /* Low two 32-bit pieces are common to all FP modes here.  */
              parts[1] = gen_int_mode (l[1], SImode);
              parts[0] = gen_int_mode (l[0], SImode);
            }
          else
            gcc_unreachable ();
        }
    }
  else
    {
      if (mode == TImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
        {
          /* XFmode splits into DImode + SImode (80 bits), TFmode into
             DImode + DImode (128 bits).  */
          machine_mode upper_mode = mode==XFmode ? SImode : DImode;
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
              parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, DImode, 0);
              parts[0] = operand;
              parts[1] = adjust_address (operand, upper_mode, 8);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              long l[4];

              real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

              /* real_to_target puts 32-bit pieces in each long.  */
              parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
                                       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
                                          << 32), DImode);

              if (upper_mode == SImode)
                parts[1] = gen_int_mode (l[2], SImode);
              else
                parts[1]
                  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
                                  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
                                     << 32), DImode);
            }
          else
            gcc_unreachable ();
        }
    }

  return size;
}
    6371              : 
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   All required insns are emitted directly; nothing is returned.
   Operands 2-5 are used as temporaries for the destination parts and
   operands 6-9 for the source parts, in the correct copy order.  */
    6376              : 
void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
         fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
          && SYMBOL_REF_P (XEXP (operands[1], 0))
          && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
        operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
        {
          /* Retype the push destination to word_mode; a fresh copy is
             needed since PUT_MODE mutates the rtx in place.  */
          operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
        }
      else
        operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
                || offsettable_memref_p (operands[0]));

  /* Split both operands into word-sized parts.  Both calls use the same
     mode, so they yield the same part count.  */
  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
          && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
        src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
         automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
        part[1][i] = change_address (part[1][i],
                                     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      /* Record which destination parts overlap the source address.  */
      for (i = 0; i < nparts; i++)
        {
          collisionparts[i]
            = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
          if (collisionparts[i])
            collisions++;
        }

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts [1])
        {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
        }
      else if (collisions == 1
               && nparts == 4
               && (collisionparts [1] || collisionparts [2]))
        {
          if (collisionparts [1])
            {
              std::swap (part[0][1], part[0][2]);
              std::swap (part[1][1], part[1][2]);
            }
          else
            {
              std::swap (part[0][2], part[0][3]);
              std::swap (part[1][2], part[1][3]);
            }
        }

      /* If there are more collisions, we can't handle it by reordering.
         Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
        {
          rtx base, addr;

          collisions = 1;

          /* The last destination part is free to serve as the lea base:
             it is written last, after all source reads.  */
          base = part[0][nparts - 1];

          /* Handle the case when the last part isn't valid for lea.
             Happens in 64-bit mode storing the 12-byte XFmode.  */
          if (GET_MODE (base) != Pmode)
            base = gen_rtx_REG (Pmode, REGNO (base));

          addr = XEXP (part[1][0], 0);
          if (TARGET_TLS_DIRECT_SEG_REFS)
            {
              struct ix86_address parts;
              int ok = ix86_decompose_address (addr, &parts);
              gcc_assert (ok);
              /* It is not valid to use %gs: or %fs: in lea.  */
              gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
            }
          emit_insn (gen_rtx_SET (base, addr));
          /* Rewrite every source part to address off the new base.  */
          part[1][0] = replace_equiv_address (part[1][0], base);
          for (i = 1; i < nparts; i++)
            {
              tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
              part[1][i] = replace_equiv_address (part[1][i], tmp);
            }
        }
    }

  if (push)
    {
      /* Pushes are emitted highest part first, since each push
         decrements the stack pointer.  */
      if (!TARGET_64BIT)
        {
          if (nparts == 3)
            {
              if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
              emit_move_insn (part[0][2], part[1][2]);
            }
          else if (nparts == 4)
            {
              emit_move_insn (part[0][3], part[1][3]);
              emit_move_insn (part[0][2], part[1][2]);
            }
        }
      else
        {
          /* In 64bit mode we don't have 32bit push available.  In case this is
             register, it is OK - we will just use larger counterpart.  We also
             retype memory - these comes from attempt to avoid REX prefix on
             moving of second half of TFmode value.  */
          if (GET_MODE (part[1][1]) == SImode)
            {
              switch (GET_CODE (part[1][1]))
                {
                case MEM:
                  part[1][1] = adjust_address (part[1][1], DImode, 0);
                  break;

                case REG:
                  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
                  break;

                default:
                  gcc_unreachable ();
                }

              if (GET_MODE (part[1][0]) == SImode)
                part[1][0] = part[1][1];
            }
        }
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
           || (nparts == 3
               && REGNO (part[0][0]) == REGNO (part[1][2]))
           || (nparts == 4
               && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
          && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      /* Copy highest part first so lower source parts (or the source
         address register) are not clobbered before being read.  */
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
        {
          operands[2 + i] = part[0][j];
          operands[6 + i] = part[1][j];
        }
    }
  else
    {
      for (i = 0; i < nparts; i++)
        {
          operands[2 + i] = part[0][i];
          operands[6 + i] = part[1][i];
        }
    }

  /* Attempt to locally unCSE nonzero constants.  */
  for (j = 0; j < nparts - 1; j++)
    if (CONST_INT_P (operands[6 + j])
        && operands[6 + j] != const0_rtx
        && REG_P (operands[2 + j]))
      for (i = j; i < nparts - 1; i++)
        if (CONST_INT_P (operands[7 + i])
            && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
          operands[7 + i] = operands[2 + j];

  /* Finally emit the moves in the order chosen above.  */
  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

  return;
}
    6597              : 
    6598              : /* Helper function of ix86_split_ashl used to generate an SImode/DImode
    6599              :    left shift by a constant, either using a single shift or
    6600              :    a sequence of add instructions.  */
    6601              : 
    6602              : static void
    6603         4343 : ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
    6604              : {
    6605         4343 :   if (count == 1
    6606         4343 :       || (count * ix86_cost->add <= ix86_cost->shift_const
    6607            0 :           && !optimize_insn_for_size_p ()))
    6608              :     {
    6609           16 :       while (count-- > 0)
    6610            8 :         emit_insn (gen_add2_insn (operand, operand));
    6611              :     }
    6612              :   else
    6613              :     {
    6614         4335 :       rtx (*insn)(rtx, rtx, rtx);
    6615              : 
    6616         4335 :       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
    6617         4335 :       emit_insn (insn (operand, operand, GEN_INT (count)));
    6618              :     }
    6619         4343 : }
    6620              : 
/* Split a double-word left shift: operands[0] = operands[1] << operands[2]
   in double-word MODE (DImode pair on 32-bit, TImode pair on 64-bit).
   SCRATCH, if non-NULL, may be used with cmov for the variable-count
   fixup.  */

void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          /* Shifting by at least a word: the low source word becomes the
             high result word and the low result word is zero.  */
          emit_move_insn (high[0], low[1]);
          ix86_expand_clear (low[0]);

          if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
        }
      else if (count == 1)
        {
          /* Double-word shift by one: add-with-carry, propagating the
             carry from the low word into the high word.  */
          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);
          rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
          rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
          half_mode = mode == DImode ? SImode : DImode;
          emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
                                             low[0], low[0]));
          emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
                                     x3, x4));
        }
      else
        {
          /* General constant count below half_width: shld feeds the top
             bits of the low word into the high word, then the low word
             is shifted on its own.  */
          gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
        }
      return;
    }

  /* Variable shift count.  */
  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
         can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
        {
          rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

          /* Test the half_width bit of the count and use setcc to place
             the 1 in either the low word (bit clear) or the high word
             (bit set), then shift both words by the count.  */
          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
          emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

          d = gen_lowpart (QImode, low[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_EQ (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));

          d = gen_lowpart (QImode, high[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_NE (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));
        }

      /* Otherwise, we can get the same results by manually performing
         a bit extract operation on bit 5/6, and then performing the two
         shifts.  The two methods of getting 0/1 into low/high are exactly
         the same size.  Avoiding the shift in the bit extract case helps
         pentium4 a bit; no one else seems to care much either way.  */
      else
        {
          rtx (*gen_lshr3)(rtx, rtx, rtx);
          rtx (*gen_and3)(rtx, rtx, rtx);
          rtx (*gen_xor3)(rtx, rtx, rtx);
          HOST_WIDE_INT bits;
          rtx x;

          if (mode == DImode)
            {
              gen_lshr3 = gen_lshrsi3;
              gen_and3 = gen_andsi3;
              gen_xor3 = gen_xorsi3;
              bits = 5;
            }
          else
            {
              gen_lshr3 = gen_lshrdi3;
              gen_and3 = gen_anddi3;
              gen_xor3 = gen_xordi3;
              bits = 6;
            }

          /* Zero-extend explicitly when writing only the low part would
             cause a partial-register stall.  */
          if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
            x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
          else
            x = gen_lowpart (half_mode, operands[2]);
          emit_insn (gen_rtx_SET (high[0], x));

          /* high = (count >> bits) & 1; low = high ^ 1.  */
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
          emit_insn (gen_and3 (high[0], high[0], const1_rtx));
          emit_move_insn (low[0], high[0]);
          emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
        }

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
         know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
        emit_move_insn (high[0], low[0]);
      else
        emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  /* The shld/shift pair only handles counts below half_width (hardware
     shifts mask the count); the adj patterns fix up the result for
     larger counts, using cmov when SCRATCH is available.  */
  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
                 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
    6775              : 
/* Split a double-word arithmetic right shift:
   operands[0] = operands[1] >> operands[2] (sign-filling) in double-word
   MODE.  SCRATCH, if non-NULL, may be used with cmov for the
   variable-count fixup.  */

void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
        {
          /* Shift by bitsize-1: both result words are the sign bit
             replicated across a whole word.  */
          emit_move_insn (high[0], high[1]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);

        }
      else if (count >= half_width)
        {
          /* Shifting by at least a word: the high source word becomes
             the low result word; the high result word is all sign bits.  */
          emit_move_insn (low[0], high[1]);
          emit_move_insn (high[0], low[0]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));

          if (count > half_width)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else if (count == 1
               && (TARGET_USE_RCR || optimize_size > 1))
        {
          /* Shift by one via sar + rcr, rotating the carry out of the
             high word into the top of the low word.  */
          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);
          if (mode == DImode)
            {
              /* MODE is the double-word mode, so DImode means the parts
                 are SImode.  */
              emit_insn (gen_ashrsi3_carry (high[0], high[0]));
              emit_insn (gen_rcrsi2 (low[0], low[0]));
            }
          else
            {
              emit_insn (gen_ashrdi3_carry (high[0], high[0]));
              emit_insn (gen_rcrdi2 (low[0], low[0]));
            }
        }
      else
        {
          /* General constant count below half_width: shrd feeds the low
             bits of the high word into the low word, then the high word
             is shifted arithmetically on its own.  */
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable shift count.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

     if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      /* The shrd/sar pair only handles counts below half_width; fix up
         the result for larger counts.  With cmov, SCRATCH holds the
         sign-filled word used by the adjustment.  */
      if (TARGET_CMOVE && scratch)
        {
          emit_move_insn (scratch, high[0]);
          emit_insn (gen_ashr3 (scratch, scratch,
                                GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_3
                   (half_mode, low[0], high[0], operands[2]));
    }
}
    6866              : 
/* Split a double-word logical right shift:
   operands[0] = operands[1] >> operands[2] (zero-filling) in double-word
   MODE.  SCRATCH, if non-NULL, may be used with cmov for the
   variable-count fixup.  */

void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          /* Shifting by at least a word: the high source word becomes
             the low result word and the high result word is zero.  */
          emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else if (count == 1
               && (TARGET_USE_RCR || optimize_size > 1))
        {
          /* Shift by one via shr + rcr, rotating the carry out of the
             high word into the top of the low word.  */
          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);
          if (mode == DImode)
            {
              /* MODE is the double-word mode, so DImode means the parts
                 are SImode.  */
              emit_insn (gen_lshrsi3_carry (high[0], high[0]));
              emit_insn (gen_rcrsi2 (low[0], low[0]));
            }
          else
            {
              emit_insn (gen_lshrdi3_carry (high[0], high[0]));
              emit_insn (gen_rcrdi2 (low[0], low[0]));
            }
        }
      else
        {
          /* General constant count below half_width: shrd feeds the low
             bits of the high word into the low word, then the high word
             is shifted logically on its own.  */
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable shift count.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      /* The shrd/shr pair only handles counts below half_width; fix up
         the result for larger counts, using cmov with a cleared SCRATCH
         when available.  */
      if (TARGET_CMOVE && scratch)
        {
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2
                   (half_mode, low[0], high[0], operands[2]));
    }
}
    6945              : 
/* Helper function to split TImode ashl under NDD (the APX "new data
   destination" forms, which allow a destination distinct from the
   sources).  OPERANDS[0] is the TImode destination, OPERANDS[1] the
   TImode source and OPERANDS[2] the shift count, either a CONST_INT
   or a variable.  SCRATCH, if non-NULL, is a spare DImode register
   used for the variable-count fixup when CMOV is available.  */
void
ix86_split_ashl_ndd (rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;

  rtx low[2], high[2];
  int count;

  /* Split destination and source into their DImode halves.  */
  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Only the low 7 bits of the count matter for a 128-bit shift.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (count >= half_width)
        {
          /* Shift of 64 or more: the low half becomes zero and the
             high result comes entirely from the low source half.  */
          count = count - half_width;
          if (count == 0)
            {
              if (!rtx_equal_p (high[0], low[1]))
                emit_move_insn (high[0], low[1]);
            }
          else if (count == 1)
            /* Shift left by one is an add of the register to itself.  */
            emit_insn (gen_adddi3 (high[0], low[1], low[1]));
          else
            emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));

          ix86_expand_clear (low[0]);
        }
      else if (count == 1)
        {
          /* Shift by one via add with carry propagation:
             low = low + low sets CF, then high = high + high + CF.  */
          rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
          rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
          emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
                                             low[1], low[1]));
          emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
                                     x3, x4));
        }
      else
        {
          /* General constant count < 64: SHLD the high half with bits
             funneled in from the low half, then shift the low half.  */
          emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
                                          GEN_INT (count)));
          emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable count: emit the 0..63 SHLD/SHL pair, then fix up the
         halves for counts of 64 or more (the DImode shifts only honor
         the low 6 bits of the count).  */
      emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
                                      operands[2]));
      emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
      if (TARGET_CMOVE && scratch)
        {
          /* With CMOV, conditionally swap in a zeroed scratch.  */
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (DImode, high[0], low[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
    }
}
    7007              : 
/* Helper function to split TImode l/ashr under NDD (the APX "new data
   destination" forms).  CODE is ASHIFTRT or LSHIFTRT.  OPERANDS[0] is
   the TImode destination, OPERANDS[1] the source and OPERANDS[2] the
   shift count, either a CONST_INT or a variable.  SCRATCH, if
   non-NULL, is a spare DImode register used for the variable-count
   fixup when CMOV is available.  */
void
ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;
  bool ashr_p = code == ASHIFTRT;
  /* DImode shift generator matching the requested shift kind.  */
  rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
                                         : gen_lshrdi3;

  rtx low[2], high[2];
  int count;

  /* Split destination and source into their DImode halves.  */
  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Only the low 7 bits of the count matter for a 128-bit shift.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
        {
          /* Arithmetic shift by 127: both halves are just the sign
             bit replicated across a DImode word.  */
          emit_insn (gen_shr (high[0], high[1],
                              GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);
        }
      else if (count >= half_width)
        {
          /* Shift of 64 or more: the high half becomes the sign
             extension (ashr) or zero (lshr), and the low result comes
             entirely from the high source half.  */
          if (ashr_p)
            emit_insn (gen_shr (high[0], high[1],
                                GEN_INT (half_width - 1)));
          else
            ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_shr (low[0], high[1],
                                GEN_INT (count - half_width)));
          else
            emit_move_insn (low[0], high[1]);
        }
      else
        {
          /* General constant count < 64: SHRD the low half with bits
             funneled in from the high half, then shift the high half.  */
          emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
                                          GEN_INT (count)));
          emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
        }
    }
  else
    {
      /* Variable count: emit the 0..63 SHRD/shift pair, then fix up
         the halves for counts of 64 or more.  */
      emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
                                      operands[2]));
      emit_insn (gen_shr (high[0], high[1], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          /* Materialize in SCRATCH what the high half must become for
             counts >= 64 (sign word for ashr, zero for lshr) and let
             the adjust pattern conditionally move it into place.  */
          if (ashr_p)
            {
              emit_move_insn (scratch, high[0]);
              emit_insn (gen_shr (scratch, scratch,
                                  GEN_INT (half_width - 1)));
            }
          else
            ix86_expand_clear (scratch);

          emit_insn (gen_x86_shift_adj_1
                     (DImode, low[0], high[0], operands[2], scratch));
        }
      else if (ashr_p)
        emit_insn (gen_x86_shift_adj_3
                   (DImode, low[0], high[0], operands[2]));
      else
        emit_insn (gen_x86_shift_adj_2
                   (DImode, low[0], high[0], operands[2]));
    }
}
    7081              : 
    7082              : /* Expand move of V1TI mode register X to a new TI mode register.  */
    7083              : static rtx
    7084           17 : ix86_expand_v1ti_to_ti (rtx x)
    7085              : {
    7086           17 :   rtx result = gen_reg_rtx (TImode);
    7087           17 :   if (TARGET_SSE2)
    7088              :     {
    7089           17 :       rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
    7090           17 :       rtx lo = gen_lowpart (DImode, result);
    7091           17 :       emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
    7092           17 :       rtx hi = gen_highpart (DImode, result);
    7093           17 :       emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
    7094              :     }
    7095              :   else
    7096            0 :     emit_move_insn (result, gen_lowpart (TImode, x));
    7097           17 :   return result;
    7098              : }
    7099              : 
    7100              : /* Expand move of TI mode register X to a new V1TI mode register.  */
    7101              : static rtx
    7102           17 : ix86_expand_ti_to_v1ti (rtx x)
    7103              : {
    7104           17 :   if (TARGET_SSE2)
    7105              :     {
    7106           17 :       rtx lo = gen_lowpart (DImode, x);
    7107           17 :       rtx hi = gen_highpart (DImode, x);
    7108           17 :       rtx tmp = gen_reg_rtx (V2DImode);
    7109           17 :       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
    7110           17 :       return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
    7111              :     }
    7112              : 
    7113            0 :   return force_reg (V1TImode, gen_lowpart (V1TImode, x));
    7114              : }
    7115              : 
/* Expand V1TI mode shift (of rtx_code CODE) by constant.  CODE is
   ASHIFT or LSHIFTRT.  OPERANDS[0] is the V1TImode destination,
   OPERANDS[1] the source and OPERANDS[2] the shift count.  */
void
ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  /* For a non-constant count, bounce through TImode and let the
     integer TImode shift expanders do the work.  */
  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*shift) (rtx, rtx, rtx)
            = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
      emit_insn (shift (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Only the low 7 bits of the count matter for a 128-bit shift.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  /* Shift by zero is just a copy.  */
  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* A whole-byte count is handled by a single SSE2 full-width
     128-bit shift.  */
  if ((bits & 7) == 0)
    {
      rtx tmp = gen_reg_rtx (V1TImode);
      if (code == ASHIFT)
        emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
      else
        emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
      emit_move_insn (operands[0], tmp);
      return;
    }

  /* General case: start from a copy of the operand already shifted by
     a whole 64 bits, then finish with per-element V2DI shifts.  */
  rtx tmp1 = gen_reg_rtx (V1TImode);
  if (code == ASHIFT)
    emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
  else
    emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

  /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
  rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));

  /* tmp3 will be the V2DImode result.  */
  rtx tmp3 = gen_reg_rtx (V2DImode);

  if (bits > 64)
    {
      /* Count above 64: shifting the 64-bit-preshifted copy by the
         remaining BITS - 64 gives the result directly.  */
      if (code == ASHIFT)
        emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
      else
        emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    }
  else
    {
      /* tmp4 is operands[1], in V2DImode.  */
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));

      /* Shift each 64-bit element of the original by BITS...  */
      rtx tmp5 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
        emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
      else
        emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      /* ...and recover the bits that cross the 64-bit element
         boundary from the preshifted copy, shifted the other way.  */
      rtx tmp6 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
        emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
      else
        emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));

      emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    }

  /* Convert the result back to V1TImode and store in operands[0].  */
  rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
  emit_move_insn (operands[0], tmp7);
}
    7196              : 
/* Expand V1TI mode rotate (of rtx_code CODE) by constant.  CODE is
   ROTATE or ROTATERT.  OPERANDS[0] is the V1TImode destination,
   OPERANDS[1] the source and OPERANDS[2] the rotate count.  */
void
ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  /* For a non-constant count, bounce through TImode and let the
     integer TImode rotate expanders do the work.  */
  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*rotate) (rtx, rtx, rtx)
            = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
      emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Only the low 7 bits of the count matter for a 128-bit rotate.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  /* Rotate by zero is just a copy.  */
  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* Canonicalize to a left rotate.  */
  if (code == ROTATERT)
    bits = 128 - bits;

  /* A multiple of 32 is a pure 32-bit-element permutation: a single
     pshufd (0x93 rotates left by one word, 0x4e swaps the 64-bit
     halves, 0x39 rotates left by three words).  */
  if ((bits & 31) == 0)
    {
      rtx tmp2 = gen_reg_rtx (V4SImode);
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      if (bits == 32)
        emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
      else if (bits == 64)
        emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
      else
        emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
      return;
    }

  /* A whole-byte count: OR together the two complementary SSE2
     full-width 128-bit shifts.  */
  if ((bits & 7) == 0)
    {
      rtx tmp1 = gen_reg_rtx (V1TImode);
      rtx tmp2 = gen_reg_rtx (V1TImode);
      rtx tmp3 = gen_reg_rtx (V1TImode);

      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* General case: build two word-permuted copies, LOBITS and HIBITS,
     so that for each result word the contributing source words line
     up, then combine per-word shifts of the two.  */
  rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));

  rtx lobits;
  rtx hibits;

  /* Select the word pre-rotations needed for this 32-bit "lane" of
     the count; 0x93/0x4e/0x39 are the word rotations used above, and
     one of the two operands can reuse OP1 unpermuted.  */
  switch (bits >> 5)
    {
    case 0:
      lobits = op1_v4si;
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
      break;

    case 1:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
      break;

    case 2:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
      break;

    default:
      lobits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
      hibits = op1_v4si;
      break;
    }

  /* result = (lobits << (bits % 32)) | (hibits >> (32 - bits % 32)),
     computed word-wise.  */
  rtx tmp1 = gen_reg_rtx (V4SImode);
  rtx tmp2 = gen_reg_rtx (V4SImode);
  rtx tmp3 = gen_reg_rtx (V4SImode);

  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
}
    7297              : 
    7298              : /* Expand V1TI mode ashiftrt by constant.  */
    7299              : void
    7300          109 : ix86_expand_v1ti_ashiftrt (rtx operands[])
    7301              : {
    7302          109 :   rtx op1 = force_reg (V1TImode, operands[1]);
    7303              : 
    7304          109 :   if (!CONST_INT_P (operands[2]))
    7305              :     {
    7306            3 :       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
    7307            3 :       rtx tmp2 = gen_reg_rtx (TImode);
    7308            3 :       emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
    7309            3 :       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
    7310            3 :       emit_move_insn (operands[0], tmp3);
    7311            3 :       return;
    7312              :     }
    7313              : 
    7314          106 :   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
    7315              : 
    7316          106 :   if (bits == 0)
    7317              :     {
    7318            0 :       emit_move_insn (operands[0], op1);
    7319            0 :       return;
    7320              :     }
    7321              : 
    7322          106 :   if (bits == 127)
    7323              :     {
    7324              :       /* Two operations.  */
    7325            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7326            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7327            3 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7328              : 
    7329            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7330            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7331              : 
    7332            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
    7333            3 :       return;
    7334              :     }
    7335              : 
    7336          103 :   if (bits == 64)
    7337              :     {
    7338              :       /* Three operations.  */
    7339            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7340            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7341            3 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7342              : 
    7343            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7344            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7345              : 
    7346            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7347            3 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7348            3 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7349            3 :       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
    7350              : 
    7351            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7352            3 :       return;
    7353              :     }
    7354              : 
    7355          100 :   if (bits == 96)
    7356              :     {
    7357              :       /* Three operations.  */
    7358            3 :       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
    7359            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7360            3 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
    7361              : 
    7362            3 :       rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7363            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
    7364            3 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7365            3 :       emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
    7366              : 
    7367            3 :       rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
    7368            3 :       rtx tmp7 = gen_reg_rtx (V4SImode);
    7369            3 :       emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
    7370              : 
    7371            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
    7372            3 :       return;
    7373              :     }
    7374              : 
    7375           97 :   if (bits >= 111)
    7376              :     {
    7377              :       /* Three operations.  */
    7378           21 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7379           21 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7380           21 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
    7381              : 
    7382           21 :       rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7383           21 :       rtx tmp4 = gen_reg_rtx (V8HImode);
    7384           21 :       emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
    7385              : 
    7386           21 :       rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
    7387           21 :       rtx tmp6 = gen_reg_rtx (V4SImode);
    7388           21 :       emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
    7389              : 
    7390           21 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7391           21 :       return;
    7392              :     }
    7393              : 
    7394           76 :   if (TARGET_AVX2 || TARGET_SSE4_1)
    7395              :     {
    7396              :       /* Three operations.  */
    7397           50 :       if (bits == 32)
    7398              :         {
    7399            2 :           rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7400            2 :           rtx tmp2 = gen_reg_rtx (V4SImode);
    7401            2 :           emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
    7402              : 
    7403            2 :           rtx tmp3 = gen_reg_rtx (V1TImode);
    7404            2 :           emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
    7405              : 
    7406            2 :           if (TARGET_AVX2)
    7407              :             {
    7408            1 :               rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
    7409            1 :               rtx tmp5 = gen_reg_rtx (V4SImode);
    7410            1 :               emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
    7411              :                                                GEN_INT (7)));
    7412              : 
    7413            1 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
    7414              :             }
    7415              :           else
    7416              :             {
    7417            1 :               rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7418            1 :               rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7419            1 :               rtx tmp6 = gen_reg_rtx (V8HImode);
    7420            1 :               emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
    7421              :                                              GEN_INT (0x3f)));
    7422              : 
    7423            1 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7424              :             }
    7425            2 :           return;
    7426              :         }
    7427              : 
    7428              :       /* Three operations.  */
    7429           48 :       if (bits == 8 || bits == 16 || bits == 24)
    7430              :         {
    7431            6 :           rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7432            6 :           rtx tmp2 = gen_reg_rtx (V4SImode);
    7433            6 :           emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7434              : 
    7435            6 :           rtx tmp3 = gen_reg_rtx (V1TImode);
    7436            6 :           emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
    7437              : 
    7438            6 :           if (TARGET_AVX2)
    7439              :             {
    7440            3 :               rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
    7441            3 :               rtx tmp5 = gen_reg_rtx (V4SImode);
    7442            3 :               emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
    7443              :                                                GEN_INT (7)));
    7444              : 
    7445            3 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
    7446              :             }
    7447              :           else
    7448              :             {
    7449            3 :               rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7450            3 :               rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7451            3 :               rtx tmp6 = gen_reg_rtx (V8HImode);
    7452            3 :               emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
    7453              :                                              GEN_INT (0x3f)));
    7454              : 
    7455            3 :               emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
    7456              :             }
    7457            6 :           return;
    7458              :         }
    7459              :     }
    7460              : 
    7461           68 :   if (bits > 96)
    7462              :     {
    7463              :       /* Four operations.  */
    7464            3 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7465            3 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7466            3 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
    7467              : 
    7468            3 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7469            3 :       emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
    7470              : 
    7471            3 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
    7472            3 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7473            3 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7474            3 :       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
    7475              : 
    7476            3 :       rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
    7477            3 :       rtx tmp8 = gen_reg_rtx (V4SImode);
    7478            3 :       emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
    7479              : 
    7480            3 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
    7481            3 :       return;
    7482              :     }
    7483              : 
    7484           65 :   if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    7485              :     {
    7486              :       /* Four operations.  */
    7487            4 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7488            4 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7489            4 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7490              : 
    7491            4 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7492            4 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7493              : 
    7494            4 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7495            4 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
    7496              : 
    7497            4 :       rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
    7498            4 :       rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
    7499            4 :       rtx tmp7 = gen_reg_rtx (V8HImode);
    7500            6 :       emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
    7501              :                                      GEN_INT (bits == 48 ? 0x1f : 0x07)));
    7502              : 
    7503            4 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
    7504            4 :       return;
    7505              :     }
    7506              : 
    7507           61 :   if ((bits & 7) == 0)
    7508              :     {
    7509              :       /* Five operations.  */
    7510            9 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7511            9 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7512            9 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7513              : 
    7514            9 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7515            9 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7516              : 
    7517            9 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7518            9 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
    7519              : 
    7520            9 :       rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7521            9 :       rtx tmp6 = gen_reg_rtx (V1TImode);
    7522            9 :       emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
    7523              : 
    7524            9 :       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7525            9 :       rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
    7526            9 :       rtx tmp9 = gen_reg_rtx (V2DImode);
    7527            9 :       emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
    7528              : 
    7529            9 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
    7530            9 :       return;
    7531              :     }
    7532              : 
    7533           52 :   if (TARGET_AVX2 && bits < 32)
    7534              :     {
    7535              :       /* Six operations.  */
    7536            9 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7537            9 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7538            9 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7539              : 
    7540            9 :       rtx tmp3 = gen_reg_rtx (V1TImode);
    7541            9 :       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
    7542              : 
    7543            9 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7544            9 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7545            9 :       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7546              : 
    7547            9 :       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7548            9 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7549            9 :       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
    7550              : 
    7551            9 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7552            9 :       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
    7553              : 
    7554            9 :       rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
    7555            9 :       rtx tmp10 = gen_reg_rtx (V4SImode);
    7556            9 :       emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
    7557              : 
    7558            9 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
    7559            9 :       return;
    7560              :     }
    7561              : 
    7562           43 :   if (TARGET_SSE4_1 && bits < 15)
    7563              :     {
    7564              :       /* Six operations.  */
    7565            4 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7566            4 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7567            4 :       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
    7568              : 
    7569            4 :       rtx tmp3 = gen_reg_rtx (V1TImode);
    7570            4 :       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
    7571              : 
    7572            4 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7573            4 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7574            4 :       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
    7575              : 
    7576            4 :       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7577            4 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7578            4 :       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
    7579              : 
    7580            4 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7581            4 :       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
    7582              : 
    7583            4 :       rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
    7584            4 :       rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
    7585            4 :       rtx tmp11 = gen_reg_rtx (V8HImode);
    7586            4 :       emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
    7587              : 
    7588            4 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
    7589            4 :       return;
    7590              :     }
    7591              : 
    7592           18 :   if (bits == 1)
    7593              :     {
    7594              :       /* Eight operations.  */
    7595            1 :       rtx tmp1 = gen_reg_rtx (V1TImode);
    7596            1 :       emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
    7597              : 
    7598            1 :       rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7599            1 :       rtx tmp3 = gen_reg_rtx (V2DImode);
    7600            1 :       emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
    7601              : 
    7602            1 :       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
    7603            1 :       rtx tmp5 = gen_reg_rtx (V2DImode);
    7604            1 :       emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
    7605              : 
    7606            1 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7607            1 :       emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
    7608              : 
    7609            1 :       rtx tmp7 = gen_reg_rtx (V2DImode);
    7610            1 :       emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
    7611              : 
    7612            1 :       rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
    7613            1 :       rtx tmp9 = gen_reg_rtx (V4SImode);
    7614            1 :       emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
    7615              : 
    7616            1 :       rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
    7617            1 :       rtx tmp11 = gen_reg_rtx (V2DImode);
    7618            1 :       emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
    7619              : 
    7620            1 :       rtx tmp12 = gen_reg_rtx (V2DImode);
    7621            1 :       emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
    7622              : 
    7623            1 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
    7624            1 :       return;
    7625              :     }
    7626              : 
    7627           38 :   if (bits > 64)
    7628              :     {
    7629              :       /* Eight operations.  */
    7630           12 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7631           12 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7632           12 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7633              : 
    7634           12 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7635           12 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7636              : 
    7637           12 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7638           12 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
    7639              : 
    7640           12 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7641           12 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7642           12 :       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
    7643              : 
    7644           12 :       rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7645           12 :       rtx tmp8 = gen_reg_rtx (V1TImode);
    7646           12 :       emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
    7647              : 
    7648           12 :       rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
    7649           12 :       rtx tmp10 = gen_reg_rtx (V2DImode);
    7650           12 :       emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
    7651              : 
    7652           12 :       rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
    7653           12 :       rtx tmp12 = gen_reg_rtx (V2DImode);
    7654           12 :       emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
    7655              : 
    7656           12 :       rtx tmp13 = gen_reg_rtx (V2DImode);
    7657           12 :       emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
    7658              : 
    7659           12 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
    7660              :     }
    7661              :   else
    7662              :     {
    7663              :       /* Nine operations.  */
    7664           26 :       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
    7665           26 :       rtx tmp2 = gen_reg_rtx (V4SImode);
    7666           26 :       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
    7667              : 
    7668           26 :       rtx tmp3 = gen_reg_rtx (V4SImode);
    7669           26 :       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
    7670              : 
    7671           26 :       rtx tmp4 = gen_reg_rtx (V1TImode);
    7672           26 :       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
    7673              : 
    7674           26 :       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
    7675           26 :       rtx tmp6 = gen_reg_rtx (V2DImode);
    7676           26 :       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
    7677              : 
    7678           26 :       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
    7679           26 :       rtx tmp8 = gen_reg_rtx (V2DImode);
    7680           26 :       emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
    7681              : 
    7682           26 :       rtx tmp9 = gen_reg_rtx (V2DImode);
    7683           26 :       emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
    7684              : 
    7685           26 :       rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
    7686           26 :       rtx tmp11 = gen_reg_rtx (V1TImode);
    7687           26 :       emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
    7688              : 
    7689           26 :       rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
    7690           26 :       rtx tmp13 = gen_reg_rtx (V2DImode);
    7691           26 :       emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
    7692              : 
    7693           26 :       rtx tmp14 = gen_reg_rtx (V2DImode);
    7694           26 :       emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
    7695              : 
    7696           26 :       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
    7697              :     }
    7698              : }
    7699              : 
/* Expand V2DI mode ashiftrt: arithmetic right shift of OPERANDS[1] by
   OPERANDS[2], storing the result in OPERANDS[0].  SSE before AVX-512
   has no native 64-bit-element arithmetic right shift, so this routine
   synthesizes one, trying progressively more general (and more
   expensive) strategies depending on the enabled ISA and whether the
   shift count is constant.  */
void
ix86_expand_v2di_ashiftrt (rtx operands[])
{
  /* Shift by zero degenerates to a plain move.  */
  if (operands[2] == const0_rtx)
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* A shift count of 63 (or more) fills each element with copies of its
     sign bit.  With SSE4.2 that is a single signed compare: 0 > x gives
     all-ones exactly where x is negative.  Skipped when optimizing for
     size, since materializing the zero vector costs an extra insn.  */
  if (TARGET_SSE4_2
      && CONST_INT_P (operands[2])
      && UINTVAL (operands[2]) >= 63
      && !optimize_insn_for_size_p ())
    {
      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
      emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
      return;
    }

  /* Constant shift counts (when XOP's variable shift isn't a better
     choice): compute the needed 32-bit halves with V4SI shifts, then
     assemble the result with a 4-lane permute.  SEL indexes the lanes
     of the concatenation ARG0 ++ ARG1 (0-3 from ARG0, 4-7 from ARG1).  */
  if (CONST_INT_P (operands[2])
      && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
    {
      vec_perm_builder sel (4, 4, 1);
      sel.quick_grow (4);
      rtx arg0, arg1;
      rtx op1 = lowpart_subreg (V4SImode,
                                force_reg (V2DImode, operands[1]),
                                V2DImode);
      rtx target = gen_reg_rtx (V4SImode);
      if (UINTVAL (operands[2]) >= 63)
        {
          /* Count >= 63: every result lane is the element's sign mask;
             broadcast the high (sign-carrying) lane of each element.  */
          arg0 = arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
          sel[0] = 1;
          sel[1] = 1;
          sel[2] = 3;
          sel[3] = 3;
        }
      else if (INTVAL (operands[2]) > 32)
        {
          /* 32 < count < 63: low result half comes from the high input
             half shifted by (count - 32); high result half is the sign
             mask.  */
          arg0 = gen_reg_rtx (V4SImode);
          arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
          emit_insn (gen_ashrv4si3 (arg0, op1,
                                    GEN_INT (INTVAL (operands[2]) - 32)));
          sel[0] = 1;
          sel[1] = 5;
          sel[2] = 3;
          sel[3] = 7;
        }
      else if (INTVAL (operands[2]) == 32)
        {
          /* count == 32: low result half is the unshifted high input
             half; high result half is the sign mask.  */
          arg0 = op1;
          arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
          sel[0] = 1;
          sel[1] = 5;
          sel[2] = 3;
          sel[3] = 7;
        }
      else
        {
          /* count < 32: a 64-bit logical shift provides the correct low
             halves (including bits carried down from the high halves);
             a 32-bit arithmetic shift provides the correct high halves.  */
          arg0 = gen_reg_rtx (V2DImode);
          arg1 = gen_reg_rtx (V4SImode);
          emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
          emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
          arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
          sel[0] = 0;
          sel[1] = 5;
          sel[2] = 2;
          sel[3] = 7;
        }
      /* One input when ARG0 == ARG1, two otherwise.  */
      vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
      rtx op0 = operands[0];
      bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
                                                  target, arg0, arg1,
                                                  indices);
      gcc_assert (ok);
      emit_move_insn (op0, lowpart_subreg (V2DImode, target, V4SImode));
      return;
    }
  /* Variable (or small XOP-unfriendly) counts without XOP:
     x >> n  ==  (x >>u n) | (sign_mask << (64 - n)), where sign_mask is
     0 or all-ones per element.  */
  if (!TARGET_XOP)
    {
      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
      rtx zero_or_all_ones;
      if (TARGET_SSE4_2)
        {
          /* 0 > x yields the per-element sign mask directly.  */
          zero_or_all_ones = gen_reg_rtx (V2DImode);
          emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
                                         operands[1]));
        }
      else
        {
          /* Without PCMPGTQ: arithmetic-shift each 32-bit lane by 31,
             then replicate the high lane of each element (pshufd with
             lanes 1,1,3,3) to cover the full 64 bits.  */
          rtx temp = gen_reg_rtx (V4SImode);
          emit_insn (gen_ashrv4si3 (temp,
                                    lowpart_subreg (V4SImode,
                                                    force_reg (V2DImode,
                                                               operands[1]),
                                                    V2DImode),
                                    GEN_INT (31)));
          zero_or_all_ones = gen_reg_rtx (V4SImode);
          emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
                                        const1_rtx, const1_rtx,
                                        GEN_INT (3), GEN_INT (3)));
          zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
                                             V4SImode);
        }
      rtx lshr_res = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
      rtx ashl_res = gen_reg_rtx (V2DImode);
      rtx amount;
      /* Compute 64 - count for the compensating left shift.  */
      if (TARGET_64BIT)
        {
          amount = gen_reg_rtx (DImode);
          emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
                                 operands[2]));
        }
      else
        {
          /* 32-bit target: subtract in SImode and place the result in
             the low element of a zeroed V4SI before taking its DImode
             lowpart.  */
          rtx temp = gen_reg_rtx (SImode);
          emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
                                 lowpart_subreg (SImode, operands[2],
                                                 DImode)));
          amount = gen_reg_rtx (V4SImode);
          emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
                                        temp));
        }
      amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
      emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
      emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
      return;
    }

  /* XOP path: vpshaq shifts left for positive counts and right for
     negative ones, so broadcast the negated count and shift once.  */
  rtx reg = gen_reg_rtx (V2DImode);
  rtx par;
  bool negate = false;
  int i;

  if (CONST_INT_P (operands[2]))
    operands[2] = GEN_INT (-INTVAL (operands[2]));
  else
    negate = true;

  par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
  for (i = 0; i < 2; i++)
    XVECEXP (par, 0, i) = operands[2];

  emit_insn (gen_vec_initv2didi (reg, par));

  /* Non-constant counts are negated after the broadcast.  */
  if (negate)
    emit_insn (gen_negv2di2 (reg, reg));

  emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
}
    7855              : 
    7856              : /* Replace all occurrences of REG FROM with REG TO in X, including
    7857              :    occurrences with different modes.  */
    7858              : 
    7859              : rtx
    7860        38659 : ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
    7861              : {
    7862        38659 :   gcc_checking_assert (REG_P (from)
    7863              :                        && REG_P (to)
    7864              :                        && GET_MODE (from) == GET_MODE (to));
    7865        38659 :   if (!reg_overlap_mentioned_p (from, x))
    7866              :     return x;
    7867          100 :   rtx ret = copy_rtx (x);
    7868          100 :   subrtx_ptr_iterator::array_type array;
    7869          488 :   FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
    7870              :     {
    7871          388 :       rtx *loc = *iter;
    7872          388 :       x = *loc;
    7873          388 :       if (REG_P (x) && REGNO (x) == REGNO (from))
    7874              :         {
    7875          100 :           if (x == from)
    7876          100 :             *loc = to;
    7877              :           else
    7878              :             {
    7879            0 :               gcc_checking_assert (REG_NREGS (x) == 1);
    7880            0 :               *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
    7881              :             }
    7882              :         }
    7883              :     }
    7884          100 :   return ret;
    7885          100 : }
    7886              : 
    7887              : /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
    7888              :    DImode for constant loop counts.  */
    7889              : 
    7890              : static machine_mode
    7891        33570 : counter_mode (rtx count_exp)
    7892              : {
    7893         7376 :   if (GET_MODE (count_exp) != VOIDmode)
    7894        26767 :     return GET_MODE (count_exp);
    7895         6803 :   if (!CONST_INT_P (count_exp))
    7896            0 :     return Pmode;
    7897              :   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    7898              :     return DImode;
    7899              :   return SImode;
    7900              : }
    7901              : 
    7902              : /* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
    7903              :    to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
    7904              :    specified in bytes.  When ISSETMEM is TRUE, output the equivalent loop to set
    7905              :    memory by VALUE (supposed to be in MODE).
    7906              : 
    7907              :    The size is rounded down to whole number of chunk size moved at once.
    7908              :    SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.  */
    7909              : 
    7910              : 
    7911              : static void
    7912        18537 : expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
    7913              :                                rtx destptr, rtx srcptr, rtx value,
    7914              :                                rtx count, machine_mode mode, int unroll,
    7915              :                                int expected_size, bool issetmem)
    7916              : {
    7917        18537 :   rtx_code_label *out_label = nullptr;
    7918        18537 :   rtx_code_label *top_label = nullptr;
    7919        18537 :   rtx iter, tmp;
    7920        18537 :   machine_mode iter_mode = counter_mode (count);
    7921        18537 :   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
    7922        18537 :   rtx piece_size = GEN_INT (piece_size_n);
    7923        37074 :   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
    7924        18537 :   rtx size;
    7925        18537 :   int i;
    7926        18537 :   int loop_count;
    7927              : 
    7928        18537 :   if (expected_size != -1 && CONST_INT_P (count))
    7929         6719 :     loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
    7930              :   else
    7931              :     loop_count = -1;
    7932              : 
    7933              :   /* Don't generate the loop if the loop count is 1.  */
    7934         6719 :   if (loop_count != 1)
    7935              :     {
    7936        18465 :       top_label = gen_label_rtx ();
    7937        18465 :       out_label = gen_label_rtx ();
    7938              :     }
    7939        18537 :   iter = gen_reg_rtx (iter_mode);
    7940              : 
    7941        18537 :   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
    7942              :                               NULL, 1, OPTAB_DIRECT);
    7943              :   /* Those two should combine.  */
    7944        18537 :   if (piece_size == const1_rtx)
    7945              :     {
    7946         4439 :       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
    7947              :                                true, out_label);
    7948         4439 :       predict_jump (REG_BR_PROB_BASE * 10 / 100);
    7949              :     }
    7950        18537 :   emit_move_insn (iter, const0_rtx);
    7951              : 
    7952        18537 :   if (loop_count != 1)
    7953        18465 :     emit_label (top_label);
    7954              : 
    7955        21298 :   tmp = convert_modes (Pmode, iter_mode, iter, true);
    7956              : 
    7957              :   /* This assert could be relaxed - in this case we'll need to compute
    7958              :      smallest power of two, containing in PIECE_SIZE_N and pass it to
    7959              :      offset_address.  */
    7960        18537 :   gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
    7961        18537 :   destmem = offset_address (destmem, tmp, piece_size_n);
    7962        18537 :   destmem = adjust_address (destmem, mode, 0);
    7963              : 
    7964        18537 :   if (!issetmem)
    7965              :     {
    7966        12112 :       srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
    7967        12112 :       srcmem = adjust_address (srcmem, mode, 0);
    7968              : 
    7969              :       /* When unrolling for chips that reorder memory reads and writes,
    7970              :          we can save registers by using single temporary.
    7971              :          Also using 4 temporaries is overkill in 32bit mode.  */
    7972        12112 :       if (!TARGET_64BIT && 0)
    7973              :         {
    7974              :           for (i = 0; i < unroll; i++)
    7975              :             {
    7976              :               if (i)
    7977              :                 {
    7978              :                   destmem = adjust_address (copy_rtx (destmem), mode,
    7979              :                                             GET_MODE_SIZE (mode));
    7980              :                   srcmem = adjust_address (copy_rtx (srcmem), mode,
    7981              :                                            GET_MODE_SIZE (mode));
    7982              :                 }
    7983              :               emit_move_insn (destmem, srcmem);
    7984              :             }
    7985              :         }
    7986              :       else
    7987              :         {
    7988        12112 :           rtx tmpreg[4];
    7989        12112 :           gcc_assert (unroll <= 4);
    7990        49549 :           for (i = 0; i < unroll; i++)
    7991              :             {
    7992        37437 :               tmpreg[i] = gen_reg_rtx (mode);
    7993        37437 :               if (i)
    7994        50650 :                 srcmem = adjust_address (copy_rtx (srcmem), mode,
    7995              :                                          GET_MODE_SIZE (mode));
    7996        37437 :               emit_move_insn (tmpreg[i], srcmem);
    7997              :             }
    7998        49549 :           for (i = 0; i < unroll; i++)
    7999              :             {
    8000        37437 :               if (i)
    8001        50650 :                 destmem = adjust_address (copy_rtx (destmem), mode,
    8002              :                                           GET_MODE_SIZE (mode));
    8003        37437 :               emit_move_insn (destmem, tmpreg[i]);
    8004              :             }
    8005              :         }
    8006              :     }
    8007              :   else
    8008        29668 :     for (i = 0; i < unroll; i++)
    8009              :       {
    8010        23243 :         if (i)
    8011        33636 :           destmem = adjust_address (copy_rtx (destmem), mode,
    8012              :                                     GET_MODE_SIZE (mode));
    8013        23243 :         emit_move_insn (destmem, value);
    8014              :       }
    8015              : 
    8016        18537 :   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
    8017              :                              true, OPTAB_LIB_WIDEN);
    8018        18537 :   if (tmp != iter)
    8019            0 :     emit_move_insn (iter, tmp);
    8020              : 
    8021        18537 :   if (loop_count != 1)
    8022              :     {
    8023        18465 :       emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
    8024              :                                true, top_label);
    8025        18465 :       if (expected_size != -1)
    8026              :         {
    8027         9024 :           expected_size /= GET_MODE_SIZE (mode) * unroll;
    8028         9024 :           if (expected_size == 0)
    8029            1 :             predict_jump (0);
    8030         9023 :           else if (expected_size > REG_BR_PROB_BASE)
    8031            2 :             predict_jump (REG_BR_PROB_BASE - 1);
    8032              :           else
    8033         9021 :             predict_jump (REG_BR_PROB_BASE
    8034         9021 :                           - (REG_BR_PROB_BASE + expected_size / 2)
    8035         9021 :                             / expected_size);
    8036              :         }
    8037              :       else
    8038         9441 :         predict_jump (REG_BR_PROB_BASE * 80 / 100);
    8039              :     }
    8040        18537 :   iter = ix86_zero_extend_to_Pmode (iter);
    8041        21298 :   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
    8042              :                              true, OPTAB_LIB_WIDEN);
    8043        18537 :   if (tmp != destptr)
    8044            0 :     emit_move_insn (destptr, tmp);
    8045        18537 :   if (!issetmem)
    8046              :     {
    8047        13449 :       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
    8048              :                                  true, OPTAB_LIB_WIDEN);
    8049        12112 :       if (tmp != srcptr)
    8050            0 :         emit_move_insn (srcptr, tmp);
    8051              :     }
    8052        18537 :   if (loop_count != 1)
    8053        18465 :     emit_label (out_label);
    8054        18537 : }
    8055              : 
    8056              : /* Divide COUNTREG by SCALE.  */
    8057              : static rtx
    8058        14595 : scale_counter (rtx countreg, int scale)
    8059              : {
    8060        14595 :   rtx sc;
    8061              : 
    8062        14595 :   if (scale == 1)
    8063              :     return countreg;
    8064         9389 :   if (CONST_INT_P (countreg))
    8065         9373 :     return GEN_INT (INTVAL (countreg) / scale);
    8066           16 :   gcc_assert (REG_P (countreg));
    8067              : 
    8068           48 :   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
    8069           32 :                             GEN_INT (exact_log2 (scale)),
    8070              :                             NULL, 1, OPTAB_DIRECT);
    8071           16 :   return sc;
    8072              : }
    8073              : 
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
                           rtx destptr, rtx srcptr, rtx value, rtx orig_value,
                           rtx count,
                           machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  /* Canonicalize the destination MEM into a BLKmode reference through
     DESTPTR at offset 0, as the rep insn patterns expect.  */
  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  /* The count register holds the number of MODE-sized chunks,
     zero-extended to Pmode.  */
  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
                                                       GET_MODE_SIZE (mode)));
  /* DESTEXP is the final value of the destination pointer:
     destptr + countreg * GET_MODE_SIZE (mode).  */
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  /* For a known byte count record the exact number of bytes written on a
     fresh copy of the MEM; otherwise drop any stale size information.  */
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
        = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      /* rep stos: the fill value must be in a register of MODE.  */
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      /* rep mov: mirror the destination setup for the source operand.  */
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
        srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
        {
          srcexp = gen_rtx_ASHIFT (Pmode, countreg,
                                   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
          srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
        }
      else
        srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      /* Record or clear the known source size, as for the destination.  */
      if (CONST_INT_P (count))
        {
          rounded_count
            = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
          srcmem = shallow_copy_rtx (srcmem);
          set_mem_size (srcmem, rounded_count);
        }
      else
        {
          if (MEM_SIZE_KNOWN_P (srcmem))
            clear_mem_size (srcmem);
        }
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
                              destexp, srcexp));
    }
}
    8155              : 
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   DESTPTR and SRCPTR are advanced past the bytes copied.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
         || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      /* If no movable vector mode of that size exists, fall back to
         copying in word-sized pieces.  */
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
        {
          move_mode = word_mode;
          piece_size = GET_MODE_SIZE (move_mode);
          code = optab_handler (mov_optab, move_mode);
        }
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
         a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      /* Advance both pointer registers past the piece just copied.  */
      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
                      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
                                          piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
    8224              : 
/* Helper function for the string operations below.  Test whether the bits
   of VALUE are set in VARIABLE (i.e. VARIABLE & VALUE is nonzero).  Emit a
   jump to the returned label taken when they are all clear; the caller
   places the conditional code between this call and emit_label so that it
   is skipped in that case.  EPILOGUE selects the branch-prediction hint:
   50% taken for epilogue tests, 90% otherwise.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  /* tmpcount = variable & value.  */
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  /* Jump to LABEL when the masked bits are zero.  */
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
                           1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
    8245              : 
    8246              : 
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */

static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
                        rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      /* Known count: emit a straight-line copy of the remainder
         via move_by_pieces.  */
      unsigned HOST_WIDE_INT countval = UINTVAL (count);
      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
      unsigned int destalign = MEM_ALIGN (destmem);
      cfun->machine->by_pieces_in_use = true;
      move_by_pieces (destmem, srcmem, epilogue_size, destalign,
                      RETURN_BEGIN);
      cfun->machine->by_pieces_in_use = false;
      return;
    }
  if (max_size > 8)
    {
      /* Larger variable remainder: mask COUNT down to the remainder and
         copy it with a byte loop (unroll factor 4).  */
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
                                    count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
                                     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      /* Test bits 4, 2, 1 of COUNT in turn, copying one piece of the
         corresponding size for each bit that is set.  */
      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          src = change_address (srcmem, HImode, srcptr);
          dest = change_address (destmem, HImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          src = change_address (srcmem, QImode, srcptr);
          dest = change_address (destmem, QImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
  else
    {
      /* Same bit-by-bit scheme, but track the position in an OFFSET
         register instead of bumping the pointers.  */
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, HImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, HImode, tmp);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, QImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, QImode, tmp);
          emit_move_insn (dest, src);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
}
    8354              : 
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   DESTPTR is advanced past the bytes stored (by the strset pattern or by
   explicit pointer increments below).
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      /* The promoted value is wider than needed; store only its low
         SIZE_TO_MOVE bytes.  */
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* Word-sized or smaller pieces can use the strset pattern, which
         also advances DESTPTR.  */
      if (piece_size <= GET_MODE_SIZE (word_mode))
        {
          emit_insn (gen_strset (destptr, dst, promoted_val));
          dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                              piece_size);
          continue;
        }

      /* Wider (vector) pieces: plain store, then bump DESTPTR by hand.  */
      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
    8411              : /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
    8412              : static void
    8413          325 : expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
    8414              :                                  rtx count, int max_size)
    8415              : {
    8416          650 :   count = expand_simple_binop (counter_mode (count), AND, count,
    8417          325 :                                GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
    8418          325 :   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
    8419          325 :                                  gen_lowpart (QImode, value), count, QImode,
    8420              :                                  1, max_size / 2, true);
    8421          325 : }
    8422              : 
/* Callback routine for store_by_pieces.  Return the RTL of a register
   containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
   is an integer or a word vector register.  If PREV_P isn't nullptr,
   it has the RTL info from the previous iteration.  */

static rtx
setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
                         fixed_size_mode mode)
{
  rtx target;
  by_pieces_prev *prev = (by_pieces_prev *) prev_p;
  if (prev)
    {
      /* Reuse the value produced on the previous iteration when its mode
         already matches, or can be reinterpreted via a subreg.  */
      rtx prev_op = prev->data;
      if (prev_op)
        {
          machine_mode prev_mode = GET_MODE (prev_op);
          if (prev_mode == mode)
            return prev_op;
          if (VECTOR_MODE_P (prev_mode)
              && VECTOR_MODE_P (mode)
              && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
            {
              target = gen_rtx_SUBREG (mode, prev_op, 0);
              return target;
            }
        }
    }

  rtx op = (rtx) op_p;
  machine_mode op_mode = GET_MODE (op);

  if (VECTOR_MODE_P (mode))
    {
      /* A QImode vector is requested; build it from OP.  */
      gcc_assert (GET_MODE_INNER (mode) == QImode);

      unsigned int op_size = GET_MODE_SIZE (op_mode);
      unsigned int size = GET_MODE_SIZE (mode);
      unsigned int nunits;
      machine_mode vec_mode;
      if (op_size < size)
        {
          /* If OP size is smaller than MODE size, duplicate it.  */
          nunits = size / GET_MODE_SIZE (QImode);
          vec_mode = mode_for_vector (QImode, nunits).require ();
          nunits = size / op_size;
          gcc_assert (SCALAR_INT_MODE_P (op_mode));
          machine_mode dup_mode
            = mode_for_vector (as_a <scalar_mode> (op_mode),
                               nunits).require ();
          target = gen_reg_rtx (vec_mode);
          op = gen_vec_duplicate (dup_mode, op);
          rtx dup_op = gen_reg_rtx (dup_mode);
          emit_move_insn (dup_op, op);
          /* View the duplicated value as a QImode vector.  */
          op = gen_rtx_SUBREG (vec_mode, dup_op, 0);
          emit_move_insn (target, op);
          return target;
        }
      /* OP is at least as wide as MODE: copy it into a QImode vector of
         OP's size, then narrow with a subreg if needed.  */
      nunits = op_size / GET_MODE_SIZE (QImode);
      vec_mode = mode_for_vector (QImode, nunits).require ();
      target = gen_reg_rtx (vec_mode);
      op = gen_rtx_SUBREG (vec_mode, op, 0);
      emit_move_insn (target, op);
      if (op_size == size)
        return target;

      rtx tmp = gen_reg_rtx (mode);
      target = gen_rtx_SUBREG (mode, target, 0);
      emit_move_insn (tmp, target);
      return tmp;
    }

  /* Scalar MODE requested.  If OP is a word vector, first extract its
     low word into an integer register.  */
  if (VECTOR_MODE_P (op_mode))
    {
      gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
      target = gen_reg_rtx (word_mode);
      op = gen_rtx_SUBREG (word_mode, op, 0);
      emit_move_insn (target, op);
    }
  else
    target = op;

  if (mode == GET_MODE (target))
    return target;

  /* Narrow (or reinterpret) TARGET to the requested scalar MODE.  */
  rtx tmp = gen_reg_rtx (mode);
  target = gen_rtx_SUBREG (mode, target, 0);
  emit_move_insn (tmp, target);
  return tmp;
}
    8513              : 
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
                        rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      /* Known count: emit a straight-line fill of the remainder via
         store_by_pieces, preferring the vector-promoted value if any.  */
      unsigned HOST_WIDE_INT countval = UINTVAL (count);
      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
      unsigned int destalign = MEM_ALIGN (destmem);
      cfun->machine->by_pieces_in_use = true;
      store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
                       vec_value ? vec_value : value, destalign, true,
                       RETURN_BEGIN);
      cfun->machine->by_pieces_in_use = false;
      return;
    }
  if (max_size > 32)
    {
      /* Large variable remainder: use a byte loop.  */
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  /* Otherwise test bits 16, 8, 4, 2, 1 of COUNT in turn, storing one
     piece of the corresponding size for each bit that is set.  */
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
    8605              : 
    8606              : /* Adjust COUNTER by the VALUE.  */
    8607              : static void
    8608           19 : ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
    8609              : {
    8610           19 :   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
    8611           19 : }
    8612              : 
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
                                  rtx destptr, rtx srcptr, rtx value,
                                  rtx vec_value, rtx count, int align,
                                  int desired_alignment, bool issetmem)
{
  int i;
  /* Walk the power-of-two misalignment bits from 1 up to DESIRED_ALIGNMENT;
     for each bit not already guaranteed by ALIGN, emit a runtime-guarded
     move/set of I bytes that clears that bit of DESTPTR.  */
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
        {
          /* LABEL skips the I-byte copy/set when DESTPTR is already aligned
             at this level (exact branch sense determined by
             ix86_expand_aligntest — not visible here).  */
          rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
          if (issetmem)
            {
              /* Use the vector value when it is wider than the scalar VALUE
                 and wide enough to cover the I-byte store.  */
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
          else
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
          /* The I bytes just handled are subtracted from COUNT.  */
          ix86_adjust_counter (count, i);
          emit_label (label);
          LABEL_NUSES (label) = 1;
          /* After this step DESTPTR is known aligned to the next level.  */
          set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
        }
    }
  return destmem;
}
    8648              : 
/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.

   The block is handled with two possibly-overlapping groups of moves:
   SIZE bytes at the start of the block and SIZE bytes ending exactly at
   DESTPTR + COUNT, so any length in SIZE..2*SIZE-1 is covered without a
   loop.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr,
                               rtx value, rtx vec_value,
                               rtx count, int size,
                               rtx done_label, bool issetmem)
{
  /* LABEL is the fall-through target taken when the COUNT&SIZE bit is not
     set (branch sense per ix86_expand_aligntest).  */
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  rtx scalar_value = value;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
        {
          if (GET_MODE (value) == VOIDmode && size > 8)
            mode = Pmode;
          else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
            mode = GET_MODE (value);
        }
      else
        mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
        switch (MOVE_MAX)
          {
          case 64:
            if (size >= 64)
              {
                mode = V64QImode;
                break;
              }
            /* FALLTHRU */
          case 32:
            mode = V32QImode;
            break;
          case 16:
            mode = V16QImode;
            break;
          case 8:
            mode = DImode;
            break;
          default:
            gcc_unreachable ();
          }
      else if (size >= 16)
        mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
    {
      /* For memset with vector and the size is smaller than the vector
         size, first try the narrower vector, otherwise, use the
         original value. */
      machine_mode inner_mode = GET_MODE_INNER (mode);
      unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
      if (nunits > 1)
        {
          /* A narrower vector of the same element mode still fits SIZE.  */
          mode = mode_for_vector (GET_MODE_INNER (mode),
                                  nunits).require ();
          value = gen_rtx_SUBREG (mode, value, 0);
        }
      else
        {
          /* Fall back to a scalar integer mode covering SIZE, taken as a
             lowpart of the original scalar VALUE.  */
          scalar_int_mode smode
            = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
          gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
                      >= GET_MODE_SIZE (smode));
          mode = smode;
          if (GET_MODE (scalar_value) == mode)
            value = scalar_value;
          else
            value = gen_rtx_SUBREG (mode, scalar_value, 0);
        }
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  /* First group: SIZE bytes at the start of the block, one MODE piece at a
     time.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  /* Reposition to COUNT - SIZE from the block start: the addresses were
     already advanced by SIZE above, so adding COUNT - 2*SIZE lands the
     second group so that it ends exactly at offset COUNT.  */
  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
                            GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
                               GET_MODE_SIZE (mode));
    }
  /* Second group: the last SIZE bytes of the block (may overlap the first
     group when COUNT < 2*SIZE, which is harmless for copy-forward and
     memset).  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
    8774              : 
/* Handle small memcpy (up to SIZE that is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
   bytes and last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
   proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is created
   on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4. Bigger sizes are handled analogously
       if (COUNT & 4)
         {
            copy 4 bytes from SRCPTR to DESTPTR
            copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
            goto done_label
         }
       if (!COUNT)
         goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
         {
            copy 2 bytes from SRCPTR to DESTPTR
            copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
         }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DEST_PTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
         Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
                                                            rtx *destptr, rtx *srcptr,
                                                            machine_mode mode,
                                                            rtx value, rtx vec_value,
                                                            rtx *count,
                                                            rtx_code_label **done_label,
                                                            int size,
                                                            int desired_align,
                                                            int align,
                                                            unsigned HOST_WIDE_INT *min_size,
                                                            bool dynamic_check,
                                                            bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Chose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
        *done_label = gen_label_rtx ();

      /* Blocks of at least SIZE bytes go to the main sequence below.  */
      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
                               1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (;size2 > 2; size2 >>= 1)
        expand_small_cpymem_or_setmem (destmem, srcmem,
                                       *destptr, *srcptr,
                                       value, vec_value,
                                       *count,
                                       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
                               1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
        {
          srcmem = change_address (srcmem, QImode, *srcptr);
          emit_move_insn (destmem, srcmem);
        }

      /* Handle sizes 2 and 3.  A 2-byte move ending at offset COUNT,
         combined with the byte copy above, covers both.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
        {
          srcmem = change_address (srcmem, HImode, *srcptr);
          srcmem = offset_address (srcmem, *count, 1);
          srcmem = offset_address (srcmem, GEN_INT (-2), 2);
          emit_move_insn (destmem, srcmem);
        }

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
                || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
       emit_label (loop_label);
       LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }


  /* Copy last SIZE bytes.  The addresses were already advanced by
     PROLOG_SIZE above, hence the -SIZE - PROLOG_SIZE offset from
     DESTPTR + COUNT.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
                            GEN_INT (-size - prolog_size),
                            1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
                               GEN_INT (-size - prolog_size),
                               1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          srcmem = offset_address (srcmem, modesize, 1);
          emit_move_insn (destmem, srcmem);
        }
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
                                      GEN_INT (prolog_size),
                                      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
        REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
                                      GEN_INT (-desired_align),
                                      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  SAVEDDEST becomes old minus new
         pointer, i.e. minus the number of bytes skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
                                       *destptr,
                                       NULL_RTX, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count: SRCPTR advances by the skipped bytes,
         COUNT shrinks by them.  */
      if (!issetmem)
        *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
                                       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
        *min_size
          = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
        *min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      if (dynamic_check)
        *count = expand_simple_binop (GET_MODE (*count), AND, *count,
                                      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
        *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
        *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
                                      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
        *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
    9015              : 
    9016              : 
/* This function is like expand_set_or_cpymem_prologue, except here we know
   how many bytes need to be copied.  That allows us to update alignment not
   only of DST, which is returned, but also of SRC, which is passed as a
   pointer for that reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
                                           rtx srcreg, rtx value, rtx vec_value,
                                           int desired_align, int align_bytes,
                                           bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  /* ALIGN_BYTES is known at compile time, so emit one unconditional
     move/set per set bit, from the lowest piece size upward.  */
  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
        {
          if (issetmem)
            {
              /* Prefer the vector value when it is wider than the scalar
                 VALUE and covers this piece.  */
              if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
                dst = emit_memset (dst, destreg, vec_value, piece_size);
              else
                dst = emit_memset (dst, destreg, value, piece_size);
            }
          else
            dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
          copied_bytes += piece_size;
        }
    }
  /* The destination is now aligned to DESIRED_ALIGN; record that and the
     reduced known size on the MEM.  */
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      /* Derive the largest alignment the advanced SRC is known to keep
         after skipping ALIGN_BYTES bytes.  */
      int src_align_bytes = get_mem_align_offset (src, desired_align
                                                       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
        src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
        {
          unsigned int src_align;
          for (src_align = desired_align; src_align >= 2; src_align >>= 1)
            {
              if ((src_align_bytes & (src_align - 1))
                   == (align_bytes & (src_align - 1)))
                break;
            }
          if (src_align > (unsigned int) desired_align)
            src_align = desired_align;
          if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
            set_mem_align (src, src_align * BITS_PER_UNIT);
        }
      if (MEM_SIZE_KNOWN_P (orig_src))
        set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
    9090              : 
    9091              : /* Return true if ALG can be used in current context.
    9092              :    Assume we expand memset if MEMSET is true.  */
                      : /* Return true if stringop algorithm ALG can be used at all for a memset
                      :    (MEMSET true) or a memcpy whose destination and source live in the
                      :    address spaces DST_AS and SRC_AS.  This checks only hard feasibility
                      :    (ISA availability, register availability, address-space limits), not
                      :    profitability.  */
    9093              : static bool
    9094       829558 : alg_usable_p (enum stringop_alg alg, bool memset,
    9095              :               addr_space_t dst_as, addr_space_t src_as)
    9096              : {
    9097       829558 :   if (alg == no_stringop)
    9098              :     return false;
    9099              :   /* It is not possible to use a library call if we have non-default
    9100              :      address space.  We can do better than the generic byte-at-a-time
    9101              :      loop, used as a fallback.  */
    9102       829558 :   if (alg == libcall &&
    9103       465386 :       !(ADDR_SPACE_GENERIC_P (dst_as) && ADDR_SPACE_GENERIC_P (src_as)))
    9104              :     return false;
                      :   /* The vector loop needs at least one vector ISA to be enabled.  */
    9105       829551 :   if (alg == vector_loop)
    9106       368319 :     return TARGET_SSE || TARGET_AVX;
    9107              :   /* Algorithms using the rep prefix want at least edi and ecx;
    9108              :      additionally, memset wants eax and memcpy wants esi.  Don't
    9109              :      consider such algorithms if the user has appropriated those
    9110              :      registers for their own purposes, or if we have the destination
    9111              :      in the non-default address space, since string insns cannot
    9112              :      override the destination segment.  */
    9113       645360 :   if (alg == rep_prefix_1_byte
    9114              :       || alg == rep_prefix_4_byte
    9115       645360 :       || alg == rep_prefix_8_byte)
    9116              :     {
    9117        31012 :       if (fixed_regs[CX_REG]
    9118        31008 :           || fixed_regs[DI_REG]
    9119        31004 :           || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])
    9120        31000 :           || !ADDR_SPACE_GENERIC_P (dst_as)
    9121        62012 :           || !(ADDR_SPACE_GENERIC_P (src_as) || Pmode == word_mode))
    9122           12 :         return false;
    9123              :     }
    9124              :   return true;
    9125              : }
    9126              : 
    9127              : /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
                      : /* Outputs: *DYNAMIC_CHECK is set to -1 when no runtime size check is
                      :    needed, otherwise to the block-size threshold above which a library
                      :    call should be taken at run time.  *NOALIGN tells the caller to skip
                      :    the alignment prologue.  RECUR is true when this is the recursive
                      :    self-call below; it prevents recursing a second time.  */
    9128              : static enum stringop_alg
    9129       164375 : decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
    9130              :             unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
    9131              :             bool memset, bool zero_memset, addr_space_t dst_as,
    9132              :             addr_space_t src_as, int *dynamic_check, bool *noalign, bool recur)
    9133              : {
    9134       164375 :   const struct stringop_algs *algs;
    9135       164375 :   bool optimize_for_speed;
    9136       164375 :   int max = 0;
    9137       164375 :   const struct processor_costs *cost;
    9138       164375 :   int i;
    9139       164375 :   bool any_alg_usable_p = false;
    9140              : 
    9141       164375 :   *noalign = false;
    9142       164375 :   *dynamic_check = -1;
    9143              : 
    9144              :   /* Even if the string operation call is cold, we still might spend a lot
    9145              :      of time processing large blocks.  */
    9146       164375 :   if (optimize_function_for_size_p (cfun)
    9147       164375 :       || (optimize_insn_for_size_p ()
    9148         8299 :           && (max_size < 256
    9149         3196 :               || (expected_size != -1 && expected_size < 256))))
    9150              :     optimize_for_speed = false;
    9151              :   else
    9152       148926 :     optimize_for_speed = true;
    9153              : 
                      :   /* Cost tables differ per CPU and per 32/64-bit mode.  */
    9154       148926 :   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
    9155       164375 :   if (memset)
    9156        48904 :     algs = &cost->memset[TARGET_64BIT != 0];
    9157              :   else
    9158       124210 :     algs = &cost->memcpy[TARGET_64BIT != 0];
    9159              : 
    9160              :   /* See maximal size for user defined algorithm.  */
    9161       821875 :   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    9162              :     {
    9163       657500 :       enum stringop_alg candidate = algs->size[i].alg;
    9164       657500 :       bool usable = alg_usable_p (candidate, memset, dst_as, src_as);
    9165       657500 :       any_alg_usable_p |= usable;
    9166              : 
    9167       657500 :       if (candidate != libcall && candidate && usable)
    9168       313234 :         max = algs->size[i].max;
    9169              :     }
    9170              : 
    9171              :   /* If expected size is not known but max size is small enough
    9172              :      so inline version is a win, set expected size into
    9173              :      the range.  */
    9174       164375 :   if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
    9175        34847 :       && expected_size == -1)
    9176        18371 :     expected_size = min_size / 2 + max_size / 2;
    9177              : 
    9178              :   /* If user specified the algorithm, honor it if possible.  */
    9179       164375 :   if (ix86_stringop_alg != no_stringop
    9180       164375 :       && alg_usable_p (ix86_stringop_alg, memset, dst_as, src_as))
    9181              :     return ix86_stringop_alg;
    9182              :   /* rep; movq or rep; movl is the smallest variant.  */
    9183       164266 :   else if (!optimize_for_speed)
    9184              :     {
    9185        15369 :       *noalign = true;
                      :       /* rep;stosl/movsl needs a size divisible by 4 and, for memset,
                      :          a zero fill value; otherwise fall back to the byte variant.  */
    9186        15369 :       if (!count || (count & 3) || (memset && !zero_memset))
    9187         5901 :         return alg_usable_p (rep_prefix_1_byte, memset, dst_as, src_as)
    9188         5901 :                ? rep_prefix_1_byte : loop_1_byte;
    9189              :       else
    9190         9468 :         return alg_usable_p (rep_prefix_4_byte, memset, dst_as, src_as)
    9191         9468 :                ? rep_prefix_4_byte : loop;
    9192              :     }
    9193              :   /* Very tiny blocks are best handled via the loop, REP is expensive to
    9194              :      setup.  */
    9195       148897 :   else if (expected_size != -1 && expected_size < 4)
    9196              :     return loop_1_byte;
    9197       145617 :   else if (expected_size != -1)
    9198              :     {
    9199              :       enum stringop_alg alg = libcall;
    9200              :       bool alg_noalign = false;
                      :       /* Walk the per-size table and pick the entry covering
                      :          EXPECTED_SIZE.  */
    9201       180603 :       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    9202              :         {
    9203              :           /* We get here if the algorithms that were not libcall-based
    9204              :              were rep-prefix based and we are unable to use rep prefixes
    9205              :              based on global register usage.  Break out of the loop and
    9206              :              use the heuristic below.  */
    9207       177670 :           if (algs->size[i].max == 0)
    9208              :             break;
    9209       177670 :           if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
    9210              :             {
    9211        74710 :               enum stringop_alg candidate = algs->size[i].alg;
    9212              : 
    9213        74710 :               if (candidate != libcall
    9214        74710 :                   && alg_usable_p (candidate, memset, dst_as, src_as))
    9215              :                 {
    9216        20273 :                   alg = candidate;
    9217        20273 :                   alg_noalign = algs->size[i].noalign;
    9218              :                 }
    9219              :               /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
    9220              :                  last non-libcall inline algorithm.  */
    9221        74710 :               if (TARGET_INLINE_ALL_STRINGOPS)
    9222              :                 {
    9223              :                   /* When the current size is best to be copied by a libcall,
    9224              :                      but we are still forced to inline, run the heuristic below
    9225              :                      that will pick code for medium sized blocks.  */
    9226        10992 :                   if (alg != libcall)
    9227              :                     {
    9228         5117 :                       *noalign = alg_noalign;
    9229         5117 :                       return alg;
    9230              :                     }
    9231         5875 :                   else if (!any_alg_usable_p)
    9232              :                     break;
    9233              :                 }
    9234        63718 :               else if (alg_usable_p (candidate, memset, dst_as, src_as)
    9235        63718 :                        && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
    9236           22 :                             && candidate == rep_prefix_1_byte
    9237              :                             /* NB: If min_size != max_size, size is
    9238              :                                unknown.  */
    9239           22 :                             && min_size != max_size))
    9240              :                 {
    9241        63699 :                   *noalign = algs->size[i].noalign;
    9242        63699 :                   return candidate;
    9243              :                 }
    9244              :             }
    9245              :         }
    9246              :     }
    9247              :   /* When asked to inline the call anyway, try to pick meaningful choice.
    9248              :      We look for maximal size of block that is faster to copy by hand and
    9249              :      take blocks of at most of that size guessing that average size will
    9250              :      be roughly half of the block.
    9251              : 
    9252              :      If this turns out to be bad, we might simply specify the preferred
    9253              :      choice in ix86_costs.  */
    9254        72572 :   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
    9255        76807 :       && (algs->unknown_size == libcall
    9256            0 :           || !alg_usable_p (algs->unknown_size, memset, dst_as, src_as)))
    9257              :     {
    9258         4235 :       enum stringop_alg alg;
    9259         4235 :       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
    9260              : 
    9261              :       /* If there aren't any usable algorithms or if recursing already,
    9262              :          then recursing on smaller sizes or same size isn't going to
    9263              :          find anything.  Just return the simple byte-at-a-time copy loop.  */
    9264         4235 :       if (!any_alg_usable_p || recur)
    9265              :         {
    9266              :           /* Pick something reasonable.  */
    9267            0 :           if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
    9268            0 :             *dynamic_check = 128;
    9269            0 :           return loop_1_byte;
    9270              :         }
                      :       /* Retry with a synthesized expected size; RECUR=true guarantees the
                      :          recursion terminates after one level.  */
    9271         4235 :       alg = decide_alg (count, new_expected_size, min_size, max_size,
    9272              :                         memset, zero_memset, dst_as, src_as,
    9273              :                         dynamic_check, noalign, true);
    9274         4235 :       gcc_assert (*dynamic_check == -1);
    9275         4235 :       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
    9276            8 :         *dynamic_check = max;
    9277              :       else
    9278         4227 :         gcc_assert (alg != libcall);
    9279         4235 :       return alg;
    9280              :     }
    9281              : 
    9282              :   /* Try to use some reasonable fallback algorithm.  Note that for
    9283              :      non-default address spaces we default to a loop instead of
    9284              :      a libcall.  */
    9285              : 
    9286        72566 :   bool have_as = !(ADDR_SPACE_GENERIC_P (dst_as)
    9287              :                    && ADDR_SPACE_GENERIC_P (src_as));
    9288              : 
    9289        72566 :   return (alg_usable_p (algs->unknown_size, memset, dst_as, src_as)
    9290        72566 :           ? algs->unknown_size : have_as ? loop : libcall);
    9291              : }
    9292              : 
    9293              : /* Decide on alignment.  We know that the operand is already aligned to ALIGN
    9294              :    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
    9295              : static int
    9296        32885 : decide_alignment (int align,
    9297              :                   enum stringop_alg alg,
    9298              :                   int expected_size,
    9299              :                   machine_mode move_mode)
    9300              : {
    9301        32885 :   int desired_align = 0;
    9302              : 
    9303        32885 :   gcc_assert (alg != no_stringop);
    9304              : 
                      :   /* No inline copy loop is emitted in these cases, so there is nothing
                      :      to align for.  */
    9305        32885 :   if (alg == libcall)
    9306              :     return 0;
    9307        32885 :   if (move_mode == VOIDmode)
    9308              :     return 0;
    9309              : 
                      :   /* By default align the destination to the width of one move chunk.  */
    9310        32885 :   desired_align = GET_MODE_SIZE (move_mode);
    9311              :   /* PentiumPro has special logic triggering for 8 byte aligned blocks.
    9312              :      copying whole cacheline at once.  */
    9313        32885 :   if (TARGET_CPU_P (PENTIUMPRO)
    9314            0 :       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    9315        32885 :     desired_align = 8;
    9316              : 
                      :   /* When optimizing for size, the alignment prologue isn't worth the
                      :      extra bytes.  */
    9317        32885 :   if (optimize_size)
    9318         9317 :     desired_align = 1;
    9319        32885 :   if (desired_align < align)
    9320              :     desired_align = align;
                      :   /* Very small expected blocks would be dominated by prologue cost, so
                      :      keep whatever alignment the operand already has.  */
    9321        32885 :   if (expected_size != -1 && expected_size < 4)
    9322            0 :     desired_align = align;
    9323              : 
    9324              :   return desired_align;
    9325              : }
    9326              : 
    9327              : 
    9328              : /* Helper function for memcpy.  For QImode value 0xXY produce
    9329              :    0xXYXYXYXY of wide specified by MODE.  This is essentially
    9330              :    a * 0x10101010, but we can do slightly better than
    9331              :    synth_mult by unwinding the sequence by hand on CPUs with
    9332              :    slow multiply.  */
    9333              : static rtx
    9334        16131 : promote_duplicated_reg (machine_mode mode, rtx val)
    9335              : {
                      :   /* Zero is trivially its own broadcast in every mode.  */
    9336        16131 :   if (val == const0_rtx)
    9337        14313 :     return copy_to_mode_reg (mode, CONST0_RTX (mode));
    9338              : 
    9339         1818 :   machine_mode valmode = GET_MODE (val);
    9340         1818 :   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    9341              :     {
    9342              :       /* Duplicate the scalar value for integer vector.  */
    9343         1495 :       gcc_assert ((val == const0_rtx || val == constm1_rtx)
    9344              :                   || GET_MODE_INNER (mode) == valmode);
    9345          759 :       rtx dup = gen_reg_rtx (mode);
    9346          759 :       bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
    9347              :                                                    val);
    9348          759 :       gcc_assert (ok);
    9349              :       return dup;
    9350              :     }
    9351              : 
    9352         1059 :   rtx tmp;
    9353         1059 :   int nops = mode == DImode ? 3 : 2;
    9354              : 
                      :   /* Scalar path only handles the two word-sized integer modes.  */
    9355           30 :   gcc_assert (mode == SImode || mode == DImode);
    9356         1059 :   if (CONST_INT_P (val))
    9357              :     {
                      :       /* Compile-time constant: compute the 0xXYXY...XY pattern directly.  */
    9358          766 :       HOST_WIDE_INT v = INTVAL (val) & 255;
    9359              : 
    9360          766 :       v |= v << 8;
    9361          766 :       v |= v << 16;
    9362          766 :       if (mode == DImode)
    9363          744 :         v |= (v << 16) << 16;
    9364          766 :       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    9365              :     }
    9366              : 
    9367          293 :   if (valmode == VOIDmode)
    9368              :     valmode = QImode;
    9369          293 :   if (valmode != QImode)
    9370            0 :     val = gen_lowpart (QImode, val);
    9371          293 :   if (mode == QImode)
    9372              :     return val;
    9373          293 :   if (!TARGET_PARTIAL_REG_STALL)
    9374          293 :     nops--;
                      :   /* Decide between a multiply by 0x01010101... and a shift/or sequence
                      :      by comparing the CPU cost tables for NOPS shift+or steps against
                      :      the multiply.  */
    9375          293 :   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
    9376          293 :       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
    9377          293 :       <= (ix86_cost->shift_const + ix86_cost->add) * nops
    9378          293 :           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    9379              :     {
    9380          293 :       rtx reg = convert_modes (mode, QImode, val, true);
    9381          293 :       tmp = promote_duplicated_reg (mode, const1_rtx);
    9382          293 :       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
    9383          293 :                                   OPTAB_DIRECT);
    9384              :     }
    9385              :   else
    9386              :     {
                      :       /* Shift/or expansion: double the replicated width at each step
                      :          (8, 16, then 32 bits for DImode).  */
    9387            0 :       rtx reg = convert_modes (mode, QImode, val, true);
    9388              : 
    9389            0 :       if (!TARGET_PARTIAL_REG_STALL)
    9390            0 :         emit_insn (gen_insv_1 (mode, reg, reg));
    9391              :       else
    9392              :         {
    9393            0 :           tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
    9394              :                                      NULL, 1, OPTAB_DIRECT);
    9395            0 :           reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
    9396              :                                      OPTAB_DIRECT);
    9397              :         }
    9398            0 :       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
    9399              :                                  NULL, 1, OPTAB_DIRECT);
    9400            0 :       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
    9401            0 :       if (mode == SImode)
    9402              :         return reg;
    9403            0 :       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
    9404              :                                  NULL, 1, OPTAB_DIRECT);
    9405            0 :       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
    9406            0 :       return reg;
    9407              :     }
    9408              : }
    9409              : 
    9410              : /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
    9411              :    be needed by main loop copying SIZE_NEEDED chunks and prologue getting
    9412              :    alignment from ALIGN to DESIRED_ALIGN.  */
    9413              : static rtx
    9414        11742 : promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
    9415              :                                 int align)
    9416              : {
    9417        11742 :   rtx promoted_val;
    9418              : 
                      :   /* Pick the widest integer mode either the main loop chunk
                      :      (SIZE_NEEDED) or the alignment prologue (DESIRED_ALIGN > ALIGN)
                      :      will store, widest first.  */
    9419        11742 :   if (TARGET_64BIT
    9420        10239 :       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    9421         4498 :     promoted_val = promote_duplicated_reg (DImode, val);
    9422         7244 :   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    9423         5447 :     promoted_val = promote_duplicated_reg (SImode, val);
    9424         1797 :   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    9425            0 :     promoted_val = promote_duplicated_reg (HImode, val);
    9426              :   else
                      :     /* Only byte stores are needed; VAL can be used as-is.  */
    9427              :     promoted_val = val;
    9428              : 
    9429        11742 :   return promoted_val;
    9430              : }
    9431              : 
    9432              : /* Copy the address to a Pmode register.  This is used for x32 to
    9433              :    truncate DImode TLS address to a SImode register. */
    9434              : 
    9435              : static rtx
    9436        66236 : ix86_copy_addr_to_reg (rtx addr)
    9437              : {
    9438        66236 :   rtx reg;
                      :   /* Common case: ADDR already has Pmode (VOIDmode covers constants).  */
    9439        70817 :   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    9440              :     {
    9441        66236 :       reg = copy_addr_to_reg (addr);
    9442        66236 :       REG_POINTER (reg) = 1;
    9443        66236 :       return reg;
    9444              :     }
    9445              :   else
    9446              :     {
                      :       /* x32 TLS case: a DImode address with SImode Pmode.  Copy the
                      :          full value and return its low SImode part.  */
    9447            0 :       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
    9448            0 :       reg = copy_to_mode_reg (DImode, addr);
    9449            0 :       REG_POINTER (reg) = 1;
    9450            0 :       return gen_rtx_SUBREG (SImode, reg, 0);
    9451              :     }
    9452              : }
    9453              : 
    9454              : /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
    9455              :    operations when profitable.  The code depends upon architecture, block size
    9456              :    and alignment, but always has one of the following overall structures:
    9457              : 
    9458              :    Aligned move sequence:
    9459              : 
    9460              :      1) Prologue guard: Conditional that jumps up to epilogues for small
    9461              :         blocks that can be handled by epilogue alone.  This is faster
    9462              :         but also needed for correctness, since the prologue assumes the block
    9463              :         is larger than the desired alignment.
    9464              : 
    9465              :         Optional dynamic check for size and libcall for large
    9466              :         blocks is emitted here too, with -minline-stringops-dynamically.
    9467              : 
    9468              :      2) Prologue: copy first few bytes in order to get destination
    9469              :         aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
    9470              :         than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
    9471              :         copied.  We emit either a jump tree on power of two sized
    9472              :         blocks, or a byte loop.
    9473              : 
    9474              :      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
    9475              :         with specified algorithm.
    9476              : 
    9477              :      4) Epilogue: code copying tail of the block that is too small to be
    9478              :         handled by main body (or up to size guarded by prologue guard).
    9479              : 
    9480              :   Misaligned move sequence
    9481              : 
    9482              :      1) misaligned move prologue/epilogue containing:
    9483              :         a) Prologue handling small memory blocks and jumping to done_label
    9484              :            (skipped if blocks are known to be large enough)
    9485              :         b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
    9486              :            needed by single possibly misaligned move
    9487              :            (skipped if alignment is not needed)
    9488              :         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
    9489              : 
    9490              :      2) Zero size guard dispatching to done_label, if needed
    9491              : 
    9492              :      3) dispatch to library call, if needed,
    9493              : 
    9494              :      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
    9495              :         with specified algorithm.  */
    9496              : bool
    9497       145522 : ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
    9498              :                            rtx align_exp, rtx expected_align_exp,
    9499              :                            rtx expected_size_exp, rtx min_size_exp,
    9500              :                            rtx max_size_exp, rtx probable_max_size_exp,
    9501              :                            bool issetmem)
    9502              : {
    9503       145522 :   rtx destreg;
    9504       145522 :   rtx srcreg = NULL;
    9505       145522 :   rtx_code_label *label = NULL;
    9506       145522 :   rtx tmp;
    9507       145522 :   rtx_code_label *jump_around_label = NULL;
    9508       145522 :   HOST_WIDE_INT align = 1;
    9509       145522 :   unsigned HOST_WIDE_INT count = 0;
    9510       145522 :   HOST_WIDE_INT expected_size = -1;
    9511       145522 :   int size_needed = 0, epilogue_size_needed;
    9512       145522 :   int desired_align = 0, align_bytes = 0;
    9513       145522 :   enum stringop_alg alg;
    9514       145522 :   rtx promoted_val = NULL;
    9515       145522 :   rtx vec_promoted_val = NULL;
    9516       145522 :   bool force_loopy_epilogue = false;
    9517       145522 :   int dynamic_check;
    9518       145522 :   bool need_zero_guard = false;
    9519       145522 :   bool noalign;
    9520       145522 :   machine_mode move_mode = VOIDmode;
    9521       145522 :   int unroll_factor = 1;
    9522              :   /* TODO: Once value ranges are available, fill in proper data.  */
    9523       145522 :   unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
    9524       145522 :   unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
    9525       145522 :   unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
    9526       145522 :   bool misaligned_prologue_used = false;
    9527       145522 :   addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
    9528              : 
    9529       145522 :   if (CONST_INT_P (align_exp))
    9530       145522 :     align = INTVAL (align_exp);
    9531              :   /* i386 can do misaligned access on reasonably increased cost.  */
    9532       145522 :   if (CONST_INT_P (expected_align_exp)
    9533       145522 :       && INTVAL (expected_align_exp) > align)
    9534              :     align = INTVAL (expected_align_exp);
    9535              :   /* ALIGN is the minimum of destination and source alignment, but we care here
    9536              :      just about destination alignment.  */
    9537       138930 :   else if (!issetmem
    9538       233547 :            && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    9539         2980 :     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
    9540              : 
    9541       145522 :   if (CONST_INT_P (count_exp))
    9542              :     {
    9543        65810 :       min_size = max_size = probable_max_size = count = expected_size
    9544        65810 :         = INTVAL (count_exp);
    9545              :       /* When COUNT is 0, there is nothing to do.  */
    9546        65810 :       if (!count)
    9547              :         return true;
    9548              :     }
    9549              :   else
    9550              :     {
    9551        79712 :       if (min_size_exp)
    9552        79712 :         min_size = INTVAL (min_size_exp);
    9553        79712 :       if (max_size_exp)
    9554        66508 :         max_size = INTVAL (max_size_exp);
    9555        79712 :       if (probable_max_size_exp)
    9556        68413 :         probable_max_size = INTVAL (probable_max_size_exp);
    9557        79712 :       if (CONST_INT_P (expected_size_exp))
    9558        79712 :         expected_size = INTVAL (expected_size_exp);
    9559              :      }
    9560              : 
    9561              :   /* Make sure we don't need to care about overflow later on.  */
    9562       145520 :   if (count > (HOST_WIDE_INT_1U << 30))
    9563              :     return false;
    9564              : 
    9565       145345 :   dst_as = MEM_ADDR_SPACE (dst);
    9566       145345 :   if (!issetmem)
    9567       101098 :     src_as = MEM_ADDR_SPACE (src);
    9568              : 
    9569              :   /* Step 0: Decide on preferred algorithm, desired alignment and
    9570              :      size of chunks to be copied by main loop.  */
    9571       145345 :   alg = decide_alg (count, expected_size, min_size, probable_max_size,
    9572        44247 :                     issetmem, issetmem && val_exp == const0_rtx,
    9573              :                     dst_as, src_as, &dynamic_check, &noalign, false);
    9574              : 
    9575       145345 :   if (dump_file)
    9576            7 :     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
    9577            7 :              stringop_alg_names[alg]);
    9578              : 
    9579       145345 :   if (alg == libcall)
    9580              :     return false;
    9581        32885 :   gcc_assert (alg != no_stringop);
    9582              : 
    9583        32885 :   if (!count)
    9584        15726 :     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
    9585        32885 :   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
    9586        32885 :   if (!issetmem)
    9587        21143 :     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
    9588              : 
    9589        32885 :   bool aligned_dstmem = false;
    9590        32885 :   unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
    9591        32885 :   bool single_insn_p = count && count <= nunits;
    9592        32885 :   if (single_insn_p)
    9593              :     {
    9594              :       /* If it can be done with a single instruction, use vector
    9595              :          instruction and don't align destination.  */
    9596            6 :       alg = vector_loop;
    9597            6 :       noalign = true;
    9598            6 :       dynamic_check = -1;
    9599              :     }
    9600              : 
    9601        32885 :   unroll_factor = 1;
    9602        32885 :   move_mode = word_mode;
    9603        32885 :   switch (alg)
    9604              :     {
    9605            0 :     case libcall:
    9606            0 :     case no_stringop:
    9607            0 :     case last_alg:
    9608            0 :       gcc_unreachable ();
    9609         2072 :     case loop_1_byte:
    9610         2072 :       need_zero_guard = true;
    9611         2072 :       move_mode = QImode;
    9612         2072 :       break;
    9613           45 :     case loop:
    9614           45 :       need_zero_guard = true;
    9615           45 :       break;
    9616           20 :     case unrolled_loop:
    9617           20 :       need_zero_guard = true;
    9618           20 :       unroll_factor = (TARGET_64BIT ? 4 : 2);
    9619              :       break;
    9620        16153 :     case vector_loop:
    9621        16153 :       need_zero_guard = true;
    9622        16153 :       unroll_factor = 4;
    9623              :       /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes.  */
    9624        16153 :       nunits /= GET_MODE_SIZE (word_mode);
    9625        16153 :       if (nunits > 1)
    9626              :         {
    9627        16149 :           move_mode = mode_for_vector (word_mode, nunits).require ();
    9628        16149 :           gcc_assert (optab_handler (mov_optab, move_mode)
    9629              :                       != CODE_FOR_nothing);
    9630              :         }
    9631              :       break;
    9632           24 :     case rep_prefix_8_byte:
    9633           24 :       move_mode = DImode;
    9634           24 :       break;
    9635         9364 :     case rep_prefix_4_byte:
    9636         9364 :       move_mode = SImode;
    9637         9364 :       break;
    9638         5207 :     case rep_prefix_1_byte:
    9639         5207 :       move_mode = QImode;
    9640         5207 :       break;
    9641              :     }
    9642        32885 :   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
    9643        32885 :   epilogue_size_needed = size_needed;
    9644              : 
    9645              :   /* If we are going to call any library calls conditionally, make sure any
    9646              :      pending stack adjustment happen before the first conditional branch,
    9647              :      otherwise they will be emitted before the library call only and won't
    9648              :      happen from the other branches.  */
    9649        32885 :   if (dynamic_check != -1)
    9650            7 :     do_pending_stack_adjust ();
    9651              : 
    9652        32885 :   desired_align = decide_alignment (align, alg, expected_size, move_mode);
    9653        32885 :   if (!TARGET_ALIGN_STRINGOPS || noalign)
    9654        30729 :     align = desired_align;
    9655              : 
    9656              :   /* Step 1: Prologue guard.  */
    9657              : 
    9658              :   /* Alignment code needs count to be in register.  */
    9659        32885 :   if (CONST_INT_P (count_exp) && desired_align > align)
    9660              :     {
    9661           20 :       if (INTVAL (count_exp) > desired_align
    9662           20 :           && INTVAL (count_exp) > size_needed)
    9663              :         {
    9664           20 :           align_bytes
    9665           20 :             = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
    9666           20 :           if (align_bytes <= 0)
    9667              :             align_bytes = 0;
    9668              :           else
    9669            0 :             align_bytes = desired_align - align_bytes;
    9670              :         }
    9671            0 :       if (align_bytes == 0)
    9672           40 :         count_exp = force_reg (counter_mode (count_exp), count_exp);
    9673              :     }
    9674        32885 :   gcc_assert (desired_align >= 1 && align >= 1);
    9675              : 
    9676        32885 :   if (!single_insn_p)
    9677              :     {
    9678              :       /* Misaligned move sequences handle both prologue and epilogue
    9679              :          at once.  Default code generation results in a smaller code
    9680              :          for large alignments and also avoids redundant job when sizes
    9681              :          are known precisely.  */
    9682        32879 :       misaligned_prologue_used
    9683        65758 :         = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
    9684        32873 :            && MAX (desired_align, epilogue_size_needed) <= 32
    9685        16472 :            && desired_align <= epilogue_size_needed
    9686        38941 :            && ((desired_align > align && !align_bytes)
    9687         6041 :                || (!count && epilogue_size_needed > 1)));
    9688              : 
    9689              :       /* Destination is aligned after the misaligned prologue.  */
    9690        32879 :       aligned_dstmem = misaligned_prologue_used;
    9691              : 
    9692        32879 :       if (noalign && !misaligned_prologue_used)
    9693              :         {
    9694              :           /* Also use misaligned prologue if alignment isn't needed and
    9695              :              destination isn't aligned.   Since alignment isn't needed,
    9696              :              the destination after prologue won't be aligned.  */
    9697        30723 :           aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
    9698        30723 :                             <= MEM_ALIGN (dst));
    9699        30723 :           if (!aligned_dstmem)
    9700        10423 :             misaligned_prologue_used = true;
    9701              :         }
    9702              :     }
    9703              : 
    9704              :   /* Do the cheap promotion to allow better CSE across the
    9705              :      main loop and epilogue (ie one load of the big constant in the
    9706              :      front of all code.
    9707              :      For now the misaligned move sequences do not have fast path
    9708              :      without broadcasting.  */
    9709        32885 :   if (issetmem
    9710        11742 :       && (alg == vector_loop
    9711         5849 :           || CONST_INT_P (val_exp)
    9712           47 :           || misaligned_prologue_used))
    9713              :     {
    9714         5802 :       if (alg == vector_loop)
    9715              :         {
    9716         5893 :           promoted_val = promote_duplicated_reg_to_size (val_exp,
    9717        11786 :                                                          GET_MODE_SIZE (word_mode),
    9718              :                                                          desired_align, align);
    9719              :           /* Duplicate the promoted scalar value if not 0 nor -1.  */
    9720         5893 :           vec_promoted_val
    9721         5893 :             = promote_duplicated_reg (move_mode,
    9722         5893 :                                       (val_exp == const0_rtx
    9723          759 :                                        || val_exp == constm1_rtx)
    9724              :                                       ? val_exp : promoted_val);
    9725              :         }
    9726              :       else
    9727              :         {
    9728         5802 :           promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
    9729              :                                                          desired_align, align);
    9730              :         }
    9731              :     }
    9732              :   /* Misaligned move sequences handles both prologues and epilogues at once.
    9733              :      Default code generation results in smaller code for large alignments and
    9734              :      also avoids redundant job when sizes are known precisely.  */
    9735        32838 :   if (misaligned_prologue_used)
    9736              :     {
    9737              :       /* Misaligned move prologue handled small blocks by itself.  */
    9738        10444 :       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
    9739        10444 :            (dst, src, &destreg, &srcreg,
    9740              :             move_mode, promoted_val, vec_promoted_val,
    9741              :             &count_exp,
    9742              :             &jump_around_label,
    9743        10444 :             desired_align < align
    9744            0 :             ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
    9745              :             desired_align, align, &min_size, dynamic_check, issetmem);
    9746        10444 :       if (!issetmem)
    9747         7843 :         src = change_address (src, BLKmode, srcreg);
    9748        10444 :       dst = change_address (dst, BLKmode, destreg);
    9749        10444 :       if (aligned_dstmem)
    9750           21 :         set_mem_align (dst, desired_align * BITS_PER_UNIT);
    9751        10444 :       epilogue_size_needed = 0;
    9752        10444 :       if (need_zero_guard
    9753        10184 :           && min_size < (unsigned HOST_WIDE_INT) size_needed)
    9754              :         {
    9755              :           /* It is possible that we copied enough so the main loop will not
    9756              :              execute.  */
    9757         7118 :           gcc_assert (size_needed > 1);
    9758         7118 :           if (jump_around_label == NULL_RTX)
    9759           68 :             jump_around_label = gen_label_rtx ();
    9760        14236 :           emit_cmp_and_jump_insns (count_exp,
    9761              :                                    GEN_INT (size_needed),
    9762              :                                    LTU, 0, counter_mode (count_exp), 1, jump_around_label);
    9763         7118 :           if (expected_size == -1
    9764           53 :               || expected_size < (desired_align - align) / 2 + size_needed)
    9765         7066 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9766              :           else
    9767           52 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9768              :         }
    9769              :     }
    9770              :   /* Ensure that alignment prologue won't copy past end of block.  */
    9771        22441 :   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    9772              :     {
    9773        15162 :       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
    9774              :       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
    9775              :          Make sure it is power of 2.  */
    9776        15162 :       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
    9777              : 
    9778              :       /* To improve performance of small blocks, we jump around the VAL
    9779              :          promoting mode.  This mean that if the promoted VAL is not constant,
    9780              :          we might not use it in the epilogue and have to use byte
    9781              :          loop variant.  */
    9782        15162 :       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
    9783        15162 :         force_loopy_epilogue = true;
    9784        15162 :       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9785        15154 :           || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9786              :         {
    9787              :           /* If main algorithm works on QImode, no epilogue is needed.
    9788              :              For small sizes just don't align anything.  */
    9789         2119 :           if (size_needed == 1)
    9790            0 :             desired_align = align;
    9791              :           else
    9792         2119 :             goto epilogue;
    9793              :         }
    9794        13043 :       else if (!count
    9795          251 :                && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
    9796              :         {
    9797          251 :           label = gen_label_rtx ();
    9798          502 :           emit_cmp_and_jump_insns (count_exp,
    9799              :                                    GEN_INT (epilogue_size_needed),
    9800              :                                    LTU, 0, counter_mode (count_exp), 1, label);
    9801          251 :           if (expected_size == -1 || expected_size < epilogue_size_needed)
    9802          251 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9803              :           else
    9804            0 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9805              :         }
    9806              :     }
    9807              : 
    9808              :   /* Emit code to decide on runtime whether library call or inline should be
    9809              :      used.  */
    9810        30766 :   if (dynamic_check != -1)
    9811              :     {
    9812            7 :       if (!issetmem && CONST_INT_P (count_exp))
    9813              :         {
    9814            1 :           if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
    9815              :             {
    9816            1 :               emit_block_copy_via_libcall (dst, src, count_exp);
    9817            1 :               count_exp = const0_rtx;
    9818            1 :               goto epilogue;
    9819              :             }
    9820              :         }
    9821              :       else
    9822              :         {
    9823            6 :           rtx_code_label *hot_label = gen_label_rtx ();
    9824            6 :           if (jump_around_label == NULL_RTX)
    9825            1 :             jump_around_label = gen_label_rtx ();
    9826           12 :           emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
    9827              :                                    LEU, 0, counter_mode (count_exp),
    9828              :                                    1, hot_label);
    9829            6 :           predict_jump (REG_BR_PROB_BASE * 90 / 100);
    9830            6 :           if (issetmem)
    9831            4 :             set_storage_via_libcall (dst, count_exp, val_exp);
    9832              :           else
    9833            2 :             emit_block_copy_via_libcall (dst, src, count_exp);
    9834            6 :           emit_jump (jump_around_label);
    9835            6 :           emit_label (hot_label);
    9836              :         }
    9837              :     }
    9838              : 
    9839              :   /* Step 2: Alignment prologue.  */
    9840              :   /* Do the expensive promotion once we branched off the small blocks.  */
    9841        30765 :   if (issetmem && !promoted_val)
    9842           47 :     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
    9843              :                                                    desired_align, align);
    9844              : 
    9845        30765 :   if (desired_align > align && !misaligned_prologue_used)
    9846              :     {
    9847            7 :       if (align_bytes == 0)
    9848              :         {
    9849              :           /* Except for the first move in prologue, we no longer know
    9850              :              constant offset in aliasing info.  It don't seems to worth
    9851              :              the pain to maintain it for the first move, so throw away
    9852              :              the info early.  */
    9853            7 :           dst = change_address (dst, BLKmode, destreg);
    9854            7 :           if (!issetmem)
    9855            5 :             src = change_address (src, BLKmode, srcreg);
    9856            7 :           dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
    9857              :                                             promoted_val, vec_promoted_val,
    9858              :                                             count_exp, align, desired_align,
    9859              :                                             issetmem);
    9860              :           /* At most desired_align - align bytes are copied.  */
    9861            7 :           if (min_size < (unsigned)(desired_align - align))
    9862            0 :             min_size = 0;
    9863              :           else
    9864            7 :             min_size -= desired_align - align;
    9865              :         }
    9866              :       else
    9867              :         {
    9868              :           /* If we know how many bytes need to be stored before dst is
    9869              :              sufficiently aligned, maintain aliasing info accurately.  */
    9870            0 :           dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
    9871              :                                                            srcreg,
    9872              :                                                            promoted_val,
    9873              :                                                            vec_promoted_val,
    9874              :                                                            desired_align,
    9875              :                                                            align_bytes,
    9876              :                                                            issetmem);
    9877              : 
    9878            0 :           count_exp = plus_constant (counter_mode (count_exp),
    9879            0 :                                      count_exp, -align_bytes);
    9880            0 :           count -= align_bytes;
    9881            0 :           min_size -= align_bytes;
    9882            0 :           max_size -= align_bytes;
    9883              :         }
    9884            7 :       if (need_zero_guard
    9885            7 :           && min_size < (unsigned HOST_WIDE_INT) size_needed
    9886            1 :           && (count < (unsigned HOST_WIDE_INT) size_needed
    9887            0 :               || (align_bytes == 0
    9888            0 :                   && count < ((unsigned HOST_WIDE_INT) size_needed
    9889            0 :                               + desired_align - align))))
    9890              :         {
    9891              :           /* It is possible that we copied enough so the main loop will not
    9892              :              execute.  */
    9893            1 :           gcc_assert (size_needed > 1);
    9894            1 :           if (label == NULL_RTX)
    9895            0 :             label = gen_label_rtx ();
    9896            2 :           emit_cmp_and_jump_insns (count_exp,
    9897              :                                    GEN_INT (size_needed),
    9898              :                                    LTU, 0, counter_mode (count_exp), 1, label);
    9899            1 :           if (expected_size == -1
    9900            0 :               || expected_size < (desired_align - align) / 2 + size_needed)
    9901            1 :             predict_jump (REG_BR_PROB_BASE * 20 / 100);
    9902              :           else
    9903            0 :             predict_jump (REG_BR_PROB_BASE * 60 / 100);
    9904              :         }
    9905              :     }
    9906        30765 :   if (label && size_needed == 1)
    9907              :     {
    9908            0 :       emit_label (label);
    9909            0 :       LABEL_NUSES (label) = 1;
    9910            0 :       label = NULL;
    9911            0 :       epilogue_size_needed = 1;
    9912            0 :       if (issetmem)
    9913            0 :         promoted_val = val_exp;
    9914              :     }
    9915        30765 :   else if (label == NULL_RTX && !misaligned_prologue_used)
    9916        20071 :     epilogue_size_needed = size_needed;
    9917              : 
    9918              :   /* Step 3: Main loop.  */
    9919              : 
    9920        30765 :   switch (alg)
    9921              :     {
    9922            0 :     case libcall:
    9923            0 :     case no_stringop:
    9924            0 :     case last_alg:
    9925            0 :       gcc_unreachable ();
    9926         2137 :     case loop_1_byte:
    9927         2137 :     case loop:
    9928         2137 :     case unrolled_loop:
    9929         2137 :       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
    9930              :                                      count_exp, move_mode, unroll_factor,
    9931              :                                      expected_size, issetmem);
    9932         2137 :       break;
    9933        14033 :     case vector_loop:
    9934        14033 :       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
    9935              :                                      vec_promoted_val, count_exp, move_mode,
    9936              :                                      unroll_factor, expected_size, issetmem);
    9937        14033 :       break;
    9938        14595 :     case rep_prefix_8_byte:
    9939        14595 :     case rep_prefix_4_byte:
    9940        14595 :     case rep_prefix_1_byte:
    9941        14595 :       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
    9942              :                                        val_exp, count_exp, move_mode, issetmem);
    9943        14595 :       break;
    9944              :     }
    9945              :   /* Adjust properly the offset of src and dest memory for aliasing.  */
    9946        30765 :   if (CONST_INT_P (count_exp))
    9947              :     {
    9948        17130 :       if (!issetmem)
    9949         7922 :         src = adjust_automodify_address_nv (src, BLKmode, srcreg,
    9950              :                                             (count / size_needed) * size_needed);
    9951        17130 :       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
    9952              :                                           (count / size_needed) * size_needed);
    9953              :     }
    9954              :   else
    9955              :     {
    9956        13635 :       if (!issetmem)
    9957        11395 :         src = change_address (src, BLKmode, srcreg);
    9958        13635 :       dst = change_address (dst, BLKmode, destreg);
    9959              :     }
    9960              : 
    9961              :   /* Step 4: Epilogue to copy the remaining bytes.  */
    9962        32885 :  epilogue:
    9963        32885 :   if (label)
    9964              :     {
    9965              :       /* When the main loop is done, COUNT_EXP might hold original count,
    9966              :          while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
    9967              :          Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
    9968              :          bytes. Compensate if needed.  */
    9969              : 
    9970          251 :       if (size_needed < epilogue_size_needed)
    9971              :         {
    9972            0 :           tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
    9973            0 :                                      GEN_INT (size_needed - 1), count_exp, 1,
    9974              :                                      OPTAB_DIRECT);
    9975            0 :           if (tmp != count_exp)
    9976            0 :             emit_move_insn (count_exp, tmp);
    9977              :         }
    9978          251 :       emit_label (label);
    9979          251 :       LABEL_NUSES (label) = 1;
    9980              :     }
    9981              : 
    9982        32885 :   if (count_exp != const0_rtx && epilogue_size_needed > 1)
    9983              :     {
    9984        15162 :       if (force_loopy_epilogue)
    9985            0 :         expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
    9986              :                                          epilogue_size_needed);
    9987              :       else
    9988              :         {
    9989        15162 :           if (issetmem)
    9990         7344 :             expand_setmem_epilogue (dst, destreg, promoted_val,
    9991              :                                     vec_promoted_val, count_exp,
    9992              :                                     epilogue_size_needed);
    9993              :           else
    9994         7818 :             expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
    9995              :                                     epilogue_size_needed);
    9996              :         }
    9997              :     }
    9998        32885 :   if (jump_around_label)
    9999         7120 :     emit_label (jump_around_label);
   10000              :   return true;
   10001              : }
   10002              : 
/* Fully unroll memmove of known size COUNT with up to 8 registers of
   MODE size each, loading everything before storing anything so that
   the copy is correct even when source and destination overlap.
   DST and SRC are BLKmode memory references; DESTREG and SRCREG hold
   their addresses.  Return true on success, false if more than 8
   MODE-sized moves would be required.  */

static bool
ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
                           unsigned HOST_WIDE_INT count,
                           machine_mode mode)
{
  /* If 8 registers can cover all memory, load them into
     registers and store them together to avoid possible address
     overlap between source and destination.  */
  unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
  if (moves == 0)
    {
      /* COUNT is smaller than one MODE-sized chunk: retry with the
         smallest integer mode that can hold COUNT bytes.  */
      mode = smallest_int_mode_for_size
        (count * BITS_PER_UNIT).require ();
      if (count == GET_MODE_SIZE (mode))
        moves = 1;
      else
        {
          /* Reduce the smallest move size by half so that MOVES == 1.  */
          mode = smallest_int_mode_for_size
            (GET_MODE_BITSIZE (mode) / 2).require ();
          moves = count / GET_MODE_SIZE (mode);
          gcc_assert (moves == 1);
        }
    }
  else if (moves > 8)
    return false;

  unsigned int i;
  /* tmp[0..7] hold the full MODE-sized moves; tmp[8] holds the
     overlapping epilogue value, if any.  */
  rtx tmp[9];

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  /* Emit all loads before any store.  */
  rtx srcmem = change_address (src, mode, srcreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem,
                               GEN_INT (GET_MODE_SIZE (mode)),
                               GET_MODE_SIZE (mode));
    }

  unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
  machine_mode epilogue_mode = VOIDmode;
  if (epilogue_size)
    {
      /* Handle the remaining bytes with overlapping move.  */
      epilogue_mode = smallest_int_mode_for_size
        (epilogue_size * BITS_PER_UNIT).require ();
      tmp[8] = gen_reg_rtx (epilogue_mode);
      /* Address the last EPILOGUE_MODE-sized chunk of the block,
         i.e. offset COUNT - GET_MODE_SIZE (epilogue_mode), which may
         overlap bytes already loaded above.  */
      srcmem = adjust_address (srcmem, epilogue_mode, 0);
      srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
      srcmem = offset_address (srcmem,
                               GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
                               GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (tmp[8], srcmem);
    }

  /* Now emit all stores.  */
  rtx destmem = change_address (dst, mode, destreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem,
                                GEN_INT (GET_MODE_SIZE (mode)),
                                GET_MODE_SIZE (mode));
    }

  if (epilogue_size)
    {
      /* Use overlapping move.  */
      destmem = adjust_address (destmem, epilogue_mode, 0);
      destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
      destmem = offset_address (destmem,
                                GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
                                GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (destmem, tmp[8]);
    }

  return true;
}
   10085              : 
   10086              : /* Expand memmove of size with MOVES * mode size and MOVES <= 4.  If
   10087              :    FORWARD is true, copy forward.  Otherwise copy backward.  */
   10088              : 
   10089              : static void
   10090         2944 : ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
   10091              :                            unsigned int moves, bool forward)
   10092              : {
   10093         2944 :   gcc_assert (moves <= 4);
   10094              : 
   10095              :   unsigned int i;
   10096              :   rtx tmp[8];
   10097              : 
   10098        14720 :   for (i = 0; i < moves; i++)
   10099        11776 :     tmp[i] = gen_reg_rtx (mode);
   10100              : 
   10101         2944 :   rtx step;
   10102         2944 :   if (forward)
   10103         2944 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10104              :   else
   10105         2944 :     step = GEN_INT (-GET_MODE_SIZE (mode));
   10106              : 
   10107              :   /* Load MOVES.  */
   10108        11776 :   for (i = 0; i < moves - 1; i++)
   10109              :     {
   10110         8832 :       emit_move_insn (tmp[i], srcmem);
   10111        17664 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10112              :     }
   10113         2944 :   emit_move_insn (tmp[i], srcmem);
   10114              : 
   10115              :   /* Store MOVES.  */
   10116        14720 :   for (i = 0; i < moves - 1; i++)
   10117              :     {
   10118         8832 :       emit_move_insn (destmem, tmp[i]);
   10119        17664 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10120              :     }
   10121         2944 :   emit_move_insn (destmem, tmp[i]);
   10122         2944 : }
   10123              : 
   10124              : /* Load MOVES of mode size into REGS.  If LAST is true, load the
   10125              :    last MOVES.  Otherwise, load the first MOVES.  */
   10126              : 
   10127              : static void
   10128         2944 : ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
   10129              :                          machine_mode mode, unsigned int moves,
   10130              :                          rtx regs[], bool last)
   10131              : {
   10132         2944 :   unsigned int i;
   10133              : 
   10134        14720 :   for (i = 0; i < moves; i++)
   10135        11776 :     regs[i] = gen_reg_rtx (mode);
   10136              : 
   10137         2944 :   rtx srcmem = change_address (src, mode, srcreg);
   10138         2944 :   rtx step;
   10139         2944 :   if (last)
   10140              :     {
   10141         1472 :       srcmem = offset_address (srcmem, count_exp, 1);
   10142         2944 :       step = GEN_INT (-GET_MODE_SIZE (mode));
   10143         2944 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10144              :     }
   10145              :   else
   10146         2944 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10147              : 
   10148        11776 :   for (i = 0; i < moves - 1; i++)
   10149              :     {
   10150         8832 :       emit_move_insn (regs[i], srcmem);
   10151        17664 :       srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10152              :     }
   10153         2944 :   emit_move_insn (regs[i], srcmem);
   10154         2944 : }
   10155              : 
   10156              : /* Store MOVES of mode size into REGS.  If LAST is true, store the
   10157              :    last MOVES.  Otherwise, store the first MOVES.  */
   10158              : 
   10159              : static void
   10160         2944 : ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
   10161              :                           machine_mode mode, unsigned int moves,
   10162              :                           rtx regs[], bool last)
   10163              : {
   10164         2944 :   unsigned int i;
   10165              : 
   10166         2944 :   rtx destmem = change_address (dst, mode, destreg);
   10167         2944 :   rtx step;
   10168         2944 :   if (last)
   10169              :     {
   10170         1472 :       destmem = offset_address (destmem, count_exp, 1);
   10171         2944 :       step = GEN_INT (-GET_MODE_SIZE (mode));
   10172         2944 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10173              :     }
   10174              :   else
   10175         2944 :     step = GEN_INT (GET_MODE_SIZE (mode));
   10176              : 
   10177        11776 :   for (i = 0; i < moves - 1; i++)
   10178              :     {
   10179         8832 :       emit_move_insn (destmem, regs[i]);
   10180        17664 :       destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10181              :     }
   10182         2944 :   emit_move_insn (destmem, regs[i]);
   10183         2944 : }
   10184              : 
/* Expand memmove of size between (MOVES / 2) * mode size and
   MOVES * mode size with overlapping load and store.  MOVES is even.
   MOVES >= 2 and MOVES <= 8.

   The first half of the loads starts at SRCREG and runs forward; the
   second half ends at SRCREG + COUNT_EXP and runs backward, so the two
   halves overlap when the size is below MOVES * mode size.  All loads
   are emitted before any store so that the copy is correct even when
   the source and destination ranges overlap (this is memmove, not
   memcpy).  */

static void
ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
                                       rtx srcreg, rtx count_exp,
                                       machine_mode mode,
                                       unsigned int moves)
{
  gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);

  unsigned int half_moves = moves / 2;
  unsigned int i, j;
  /* Temporaries holding all loaded data; sized for the maximum MOVES.  */
  rtx tmp[8];

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  rtx base_srcmem = change_address (src, mode, srcreg);

  /* Load the first half: HALF_MOVES chunks forward from SRCREG.  */
  rtx srcmem = base_srcmem;
  for (i = 0; i < half_moves - 1; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem,
                               GEN_INT (GET_MODE_SIZE (mode)),
                               GET_MODE_SIZE (mode));
    }
  emit_move_insn (tmp[i], srcmem);

  /* Load the second half: HALF_MOVES chunks backward from
     SRCREG + COUNT_EXP - mode size.  */
  srcmem = offset_address (base_srcmem, count_exp, 1);
  srcmem = offset_address (srcmem,
                           GEN_INT (-GET_MODE_SIZE (mode)),
                           GET_MODE_SIZE (mode));
  for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
    {
      emit_move_insn (tmp[j], srcmem);
      srcmem = offset_address (srcmem,
                               GEN_INT (-GET_MODE_SIZE (mode)),
                               GET_MODE_SIZE (mode));
    }
  emit_move_insn (tmp[j], srcmem);

  rtx base_destmem = change_address (dst, mode, destreg);

  /* Store the first half forward from DESTREG.  */
  rtx destmem = base_destmem;
  for (i = 0; i < half_moves - 1; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem,
                                GEN_INT (GET_MODE_SIZE (mode)),
                                GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, tmp[i]);

  /* Store the second half backward from DESTREG + COUNT_EXP - mode
     size, mirroring the load order.  */
  destmem = offset_address (base_destmem, count_exp, 1);
  destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
                            GET_MODE_SIZE (mode));
  for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
    {
      emit_move_insn (destmem, tmp[j]);
      destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
                                GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, tmp[j]);
}
   10256              : 
/* Expand memmove of size < mode size which is <= 64.

   Emit a ladder of size checks, dispatching to a 2-register
   overlapping copy in the widest integer mode that fits: [32,63] bytes
   in OImode, [16,31] in TImode, [8,15] in DImode, [4,7] in SImode,
   [2,3] in HImode, and a single QImode byte move for size <= 1.
   MIN_SIZE, when non-zero, is a lower bound on the runtime size and
   lets us replace a compare-and-branch with an unconditional jump and
   drop the smaller cases entirely.  Each case ends with a jump to
   DONE_LABEL, which the caller emits.  */

static void
ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
                              rtx srcreg, rtx count_exp,
                              unsigned HOST_WIDE_INT min_size,
                              machine_mode mode,
                              rtx_code_label *done_label)
{
  /* Set when MIN_SIZE proves the remaining (smaller) cases cannot
     happen, so their checks and the byte-move fallback are skipped.  */
  bool skip = false;
  machine_mode count_mode = counter_mode (count_exp);

  rtx_code_label *between_32_63_label
    = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size < 64.  */
  if (between_32_63_label)
    {
      if (min_size && min_size >= 32)
        {
          emit_jump_insn (gen_jump (between_32_63_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
                                 nullptr, count_mode, 1,
                                 between_32_63_label);
    }

  rtx_code_label *between_16_31_label
    = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 32.  */
  if (between_16_31_label)
    {
      if (min_size && min_size >= 16)
        {
          emit_jump_insn (gen_jump (between_16_31_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
                                 nullptr, count_mode, 1,
                                 between_16_31_label);
    }

  rtx_code_label *between_8_15_label
    = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 16.  */
  if (between_8_15_label)
    {
      if (min_size && min_size >= 8)
        {
          emit_jump_insn (gen_jump (between_8_15_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
                                 nullptr, count_mode, 1,
                                 between_8_15_label);
    }

  rtx_code_label *between_4_7_label
    = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 8.  */
  if (between_4_7_label)
    {
      if (min_size && min_size >= 4)
        {
          emit_jump_insn (gen_jump (between_4_7_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
                                 nullptr, count_mode, 1,
                                 between_4_7_label);
    }

  rtx_code_label *between_2_3_label
    = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 4.  */
  if (between_2_3_label)
    {
      if (min_size && min_size >= 2)
        {
          emit_jump_insn (gen_jump (between_2_3_label));
          emit_barrier ();
          skip = true;
        }
      else
        emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
                                 nullptr, count_mode, 1,
                                 between_2_3_label);
    }

  /* Fall-through case: size <= 1.  Only reachable when MIN_SIZE didn't
     prove a larger case always applies.  */
  if (!skip)
    {
      rtx_code_label *zero_label
        = min_size == 0 ? gen_label_rtx () : nullptr;
      /* Skip if size == 0.  */
      if (zero_label)
        emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
                                 nullptr, count_mode, 1,
                                 zero_label,
                                 profile_probability::unlikely ());

      /* Move 1 byte.  */
      rtx tmp0 = gen_reg_rtx (QImode);
      rtx srcmem = change_address (src, QImode, srcreg);
      emit_move_insn (tmp0, srcmem);
      rtx destmem = change_address (dst, QImode, destreg);
      emit_move_insn (destmem, tmp0);

      if (zero_label)
        emit_label (zero_label);

      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  /* Emit the out-of-line bodies for each size range.  Each uses two
     overlapping moves in the mode matching its range.  */
  if (between_32_63_label)
    {
      emit_label (between_32_63_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, OImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_16_31_label)
    {
      emit_label (between_16_31_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, TImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_8_15_label)
    {
      emit_label (between_8_15_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, DImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_4_7_label)
    {
      emit_label (between_4_7_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, SImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_2_3_label)
    {
      emit_label (between_2_3_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
                                             count_exp, HImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }
}
   10424              : 
   10425              : /* Expand movmem with overlapping unaligned loads and stores:
   10426              :    1. Load all sources into registers and store them together to avoid
   10427              :       possible address overlap between source and destination.
   10428              :    2. For known size, first try to fully unroll with 8 registers.
   10429              :    3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
   10430              :       and then store them together.
   10431              :    4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources
   10432              :       into 4 registers first and then store them together.
   10433              :    5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources
   10434              :       into 8 registers first and then store them together.
   10435              :    6. For size > 8 * MOVE_MAX,
   10436              :       a. If address of destination > address of source, copy backward
   10437              :          with a 4 * MOVE_MAX loop with unaligned loads and stores.  Load
   10438              :          the first 4 * MOVE_MAX into 4 registers before the loop and
   10439              :          store them after the loop to support overlapping addresses.
   10440              :       b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned
   10441              :          loads and stores.  Load the last 4 * MOVE_MAX into 4 registers
   10442              :          before the loop and store them after the loop to support
   10443              :          overlapping addresses.
   10444              :  */
   10445              : 
   10446              : bool
   10447        17209 : ix86_expand_movmem (rtx operands[])
   10448              : {
   10449              :   /* Since there are much less registers available in 32-bit mode, don't
   10450              :      inline movmem in 32-bit mode.  */
   10451        17209 :   if (!TARGET_64BIT)
   10452              :     return false;
   10453              : 
   10454        14831 :   rtx dst = operands[0];
   10455        14831 :   rtx src = operands[1];
   10456        14831 :   rtx count_exp = operands[2];
   10457        14831 :   rtx expected_size_exp = operands[5];
   10458        14831 :   rtx min_size_exp = operands[6];
   10459        14831 :   rtx probable_max_size_exp = operands[8];
   10460        14831 :   unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
   10461        14831 :   HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
   10462        14831 :   unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
   10463        14831 :   unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
   10464              : 
   10465        14831 :   if (CONST_INT_P (count_exp))
   10466              :     {
   10467         2285 :       min_size = probable_max_size = count = expected_size
   10468         2285 :         = INTVAL (count_exp);
   10469              :       /* When COUNT is 0, there is nothing to do.  */
   10470         2285 :       if (!count)
   10471              :         return true;
   10472              :     }
   10473              :   else
   10474              :     {
   10475        12546 :       if (min_size_exp)
   10476        12546 :         min_size = INTVAL (min_size_exp);
   10477        12546 :       if (probable_max_size_exp)
   10478         9138 :         probable_max_size = INTVAL (probable_max_size_exp);
   10479        12546 :       if (CONST_INT_P (expected_size_exp))
   10480        12546 :         expected_size = INTVAL (expected_size_exp);
   10481              :      }
   10482              : 
   10483              :   /* Make sure we don't need to care about overflow later on.  */
   10484        14831 :   if (count > (HOST_WIDE_INT_1U << 30))
   10485              :     return false;
   10486              : 
   10487        14795 :   addr_space_t dst_as = MEM_ADDR_SPACE (dst);
   10488        14795 :   addr_space_t src_as = MEM_ADDR_SPACE (src);
   10489        14795 :   int dynamic_check;
   10490        14795 :   bool noalign;
   10491        14795 :   enum stringop_alg alg = decide_alg (count, expected_size, min_size,
   10492              :                                       probable_max_size, false, false,
   10493              :                                       dst_as, src_as, &dynamic_check,
   10494              :                                       &noalign, false);
   10495        14795 :   if (alg == libcall)
   10496              :     return false;
   10497              : 
   10498         6104 :   rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
   10499         6104 :   rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
   10500              : 
   10501         6104 :   unsigned int move_max = MOVE_MAX;
   10502         6104 :   machine_mode mode = smallest_int_mode_for_size
   10503         6104 :     (move_max * BITS_PER_UNIT).require ();
   10504         6104 :   if (probable_max_size && probable_max_size < move_max)
   10505              :     {
   10506              :       /* Get a usable MOVE_MAX.  */
   10507         3278 :       mode = smallest_int_mode_for_size
   10508         3278 :         (probable_max_size * BITS_PER_UNIT).require ();
   10509              :       /* Reduce MOVE_MAX by half so that MOVE_MAX can be used.  */
   10510         6556 :       if (GET_MODE_SIZE (mode) > probable_max_size)
   10511         2795 :         mode = smallest_int_mode_for_size
   10512         2795 :           (GET_MODE_BITSIZE (mode) / 2).require ();
   10513         6556 :       move_max = GET_MODE_SIZE (mode);
   10514              :     }
   10515              : 
   10516              :   /* Try to fully unroll memmove of known size first.  */
   10517         6104 :   if (count
   10518         6104 :       && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
   10519              :                                     mode))
   10520              :     return true;
   10521              : 
   10522         3973 :   rtx_code_label *done_label = gen_label_rtx ();
   10523              : 
   10524         3973 :   rtx_code_label *less_vec_label = nullptr;
   10525         3973 :   if (min_size == 0 || min_size < move_max)
   10526         3339 :     less_vec_label = gen_label_rtx ();
   10527              : 
   10528         3973 :   machine_mode count_mode = counter_mode (count_exp);
   10529              : 
   10530              :   /* Jump to LESS_VEC_LABEL if size < MOVE_MAX.  */
   10531         3973 :   if (less_vec_label)
   10532         3339 :     emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
   10533              :                              nullptr, count_mode, 1,
   10534              :                              less_vec_label);
   10535              : 
   10536         3973 :   rtx_code_label *more_2x_vec_label = nullptr;
   10537         3973 :   if (probable_max_size == 0 || probable_max_size > 2 * move_max)
   10538         1828 :     more_2x_vec_label = gen_label_rtx ();
   10539              : 
   10540              :   /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX.  */
   10541         1828 :   if (more_2x_vec_label)
   10542         1828 :     emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
   10543              :                              nullptr, count_mode, 1,
   10544              :                              more_2x_vec_label);
   10545              : 
   10546         3973 :   if (min_size == 0 || min_size <= 2 * move_max)
   10547              :     {
   10548              :       /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX.  */
   10549         3949 :       ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
   10550              :                                              count_exp, mode, 2);
   10551         3949 :       emit_jump_insn (gen_jump (done_label));
   10552         3949 :       emit_barrier ();
   10553              :     }
   10554              : 
   10555         3973 :   if (less_vec_label)
   10556              :     {
   10557              :       /* Size < MOVE_MAX.  */
   10558         3339 :       emit_label (less_vec_label);
   10559         3339 :       ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
   10560              :                                     count_exp, min_size, mode,
   10561              :                                     done_label);
   10562         3339 :       emit_jump_insn (gen_jump (done_label));
   10563         3339 :       emit_barrier ();
   10564              :     }
   10565              : 
   10566         3973 :   if (more_2x_vec_label)
   10567              :     {
   10568              :       /* Size > 2 * MOVE_MAX and destination may overlap with source.  */
   10569         1828 :       emit_label (more_2x_vec_label);
   10570              : 
   10571         1828 :       rtx_code_label *more_8x_vec_label = nullptr;
   10572         1828 :       if (probable_max_size == 0 || probable_max_size > 8 * move_max)
   10573         1472 :         more_8x_vec_label = gen_label_rtx ();
   10574              : 
   10575              :       /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX.  */
   10576         1472 :       if (more_8x_vec_label)
   10577         1472 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
   10578              :                                  nullptr, count_mode, 1,
   10579              :                                  more_8x_vec_label);
   10580              : 
   10581         1828 :       rtx_code_label *last_4x_vec_label = nullptr;
   10582         1828 :       if (min_size == 0 || min_size < 4 * move_max)
   10583         1810 :         last_4x_vec_label = gen_label_rtx ();
   10584              : 
   10585              :       /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX.  */
   10586         1810 :       if (last_4x_vec_label)
   10587         1810 :         emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
   10588              :                                  nullptr, count_mode, 1,
   10589              :                                  last_4x_vec_label);
   10590              : 
   10591         1828 :       if (probable_max_size == 0 || probable_max_size > 4 * move_max)
   10592              :         {
   10593              :           /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX.  */
   10594         1540 :           ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
   10595              :                                                  srcreg, count_exp,
   10596              :                                                  mode, 8);
   10597         1540 :           emit_jump_insn (gen_jump (done_label));
   10598         1540 :           emit_barrier ();
   10599              :         }
   10600              : 
   10601         1828 :       if (last_4x_vec_label)
   10602              :         {
   10603              :           /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX.  */
   10604         1810 :           emit_label (last_4x_vec_label);
   10605         1810 :           ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
   10606              :                                                  srcreg, count_exp,
   10607              :                                                  mode, 4);
   10608         1810 :           emit_jump_insn (gen_jump (done_label));
   10609         1810 :           emit_barrier ();
   10610              :         }
   10611              : 
   10612         1828 :       if (more_8x_vec_label)
   10613              :         {
   10614              :           /* Size > 8 * MOVE_MAX.  */
   10615         1472 :           emit_label (more_8x_vec_label);
   10616              : 
   10617         1472 :           rtx loop_count = gen_reg_rtx (count_mode);
   10618         1472 :           emit_move_insn (loop_count, count_exp);
   10619              : 
   10620              :           /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
   10621              :              lower than destination address.  */
   10622         1472 :           rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
   10623         1472 :           emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
   10624         1472 :                                    GET_MODE (destreg), 1,
   10625              :                                    more_8x_vec_backward_label);
   10626              : 
   10627              :           /* Skip if source == destination which is less common.  */
   10628         1472 :           emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
   10629         1472 :                                    GET_MODE (destreg), 1, done_label,
   10630              :                                    profile_probability::unlikely ());
   10631              : 
   10632         1472 :           rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
   10633         1472 :           emit_move_insn (base_destreg, destreg);
   10634              : 
   10635              :           /* Load the last 4 * MOVE_MAX.  */
   10636         1472 :           rtx regs[4];
   10637         1472 :           ix86_expand_load_movmem (src, srcreg, count_exp, mode,
   10638              :                                    ARRAY_SIZE (regs), regs, true);
   10639              : 
   10640         1472 :           rtx srcmem = change_address (src, mode, srcreg);
   10641         1472 :           rtx destmem = change_address (dst, mode, destreg);
   10642              : 
   10643              :           /* Copy forward with a 4 * MOVE_MAX loop.  */
   10644         1472 :           rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
   10645         1472 :           emit_label (loop_4x_vec_forward_label);
   10646              : 
   10647         1472 :           ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);
   10648              : 
   10649         1472 :           rtx tmp;
   10650         1472 :           rtx delta = GEN_INT (4 * MOVE_MAX);
   10651              : 
   10652              :           /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
   10653         1472 :           tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
   10654              :                                      loop_count, delta, nullptr, 1,
   10655              :                                      OPTAB_DIRECT);
   10656         1472 :           if (tmp != loop_count)
   10657         1472 :             emit_move_insn (loop_count, tmp);
   10658              : 
   10659              :           /* Increment DESTREG and SRCREG by 4 * MOVE_MAX.  */
   10660         1472 :           tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
   10661              :                                      destreg, delta, nullptr, 1,
   10662              :                                      OPTAB_DIRECT);
   10663         1472 :           if (tmp != destreg)
   10664         1472 :             emit_move_insn (destreg, tmp);
   10665         1472 :           tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
   10666              :                                      delta, nullptr, 1, OPTAB_DIRECT);
   10667         1472 :           if (tmp != srcreg)
   10668         1472 :             emit_move_insn (srcreg, tmp);
   10669              : 
   10670              :           /* Stop if LOOP_EXP <= 4 * MOVE_MAX.  */
   10671         1472 :           emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
   10672         1472 :                                    GET_MODE (loop_count), 1,
   10673              :                                    loop_4x_vec_forward_label);
   10674              : 
   10675              :           /* Store the last 4 * MOVE_MAX.  */
   10676         1472 :           ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
   10677              :                                     ARRAY_SIZE (regs), regs, true);
   10678              : 
   10679         1472 :           emit_jump_insn (gen_jump (done_label));
   10680         1472 :           emit_barrier ();
   10681              : 
   10682              :           /* Copy backward with a 4 * MOVE_MAX loop.  */
   10683         1472 :           emit_label (more_8x_vec_backward_label);
   10684              : 
   10685         1472 :           base_destreg = gen_reg_rtx (GET_MODE (destreg));
   10686         1472 :           emit_move_insn (base_destreg, destreg);
   10687              : 
   10688              :           /* Load the first 4 * MOVE_MAX.  */
   10689         1472 :           ix86_expand_load_movmem (src, srcreg, count_exp, mode,
   10690              :                                    ARRAY_SIZE (regs), regs, false);
   10691              : 
   10692              :           /* Increment DESTREG and SRCREG by COUNT_EXP.  */
   10693         1472 :           tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
   10694              :                                      destreg, count_exp, nullptr, 1,
   10695              :                                      OPTAB_DIRECT);
   10696         1472 :           if (tmp != destreg)
   10697         1472 :             emit_move_insn (destreg, tmp);
   10698         1472 :           tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
   10699              :                                      count_exp, nullptr, 1, OPTAB_DIRECT);
   10700         1472 :           if (tmp != srcreg)
   10701         1472 :             emit_move_insn (srcreg, tmp);
   10702              : 
   10703         1472 :           srcmem = change_address (src, mode, srcreg);
   10704         1472 :           destmem = change_address (dst, mode, destreg);
   10705         2944 :           rtx step = GEN_INT (-GET_MODE_SIZE (mode));
   10706         2944 :           srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
   10707         2944 :           destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
   10708              : 
   10709         1472 :           rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
   10710         1472 :           emit_label (loop_4x_vec_backward_label);
   10711              : 
   10712         1472 :           ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);
   10713              : 
   10714              :           /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
   10715         1472 :           tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
   10716              :                                      loop_count, delta, nullptr, 1,
   10717              :                                      OPTAB_DIRECT);
   10718         1472 :           if (tmp != loop_count)
   10719         1472 :             emit_move_insn (loop_count, tmp);
   10720              : 
   10721              :           /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX.  */
   10722         1472 :           tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
   10723              :                                      destreg, delta, nullptr, 1,
   10724              :                                      OPTAB_DIRECT);
   10725         1472 :           if (tmp != destreg)
   10726         1472 :             emit_move_insn (destreg, tmp);
   10727         1472 :           tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
   10728              :                                      delta, nullptr, 1, OPTAB_DIRECT);
   10729         1472 :           if (tmp != srcreg)
   10730         1472 :             emit_move_insn (srcreg, tmp);
   10731              : 
   10732              :           /* Stop if LOOP_EXP <= 4 * MOVE_MAX.  */
   10733         1472 :           emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
   10734         1472 :                                    GET_MODE (loop_count), 1,
   10735              :                                    loop_4x_vec_backward_label);
   10736              : 
   10737              :           /* Store the first 4 * MOVE_MAX.  */
   10738         1472 :           ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
   10739              :                                     ARRAY_SIZE (regs), regs, false);
   10740              : 
   10741         1472 :           emit_jump_insn (gen_jump (done_label));
   10742         1472 :           emit_barrier ();
   10743              :         }
   10744              :     }
   10745              : 
   10746         3973 :   emit_label (done_label);
   10747              : 
   10748         3973 :   return true;
   10749              : }
   10750              : 
/* Expand cmpstrn (strncmp) or cmpmem (memcmp) inline as "repz cmpsb".

   RESULT receives the signed comparison result (SImode, <0/0/>0).
   SRC1/SRC2 are the MEM operands to compare, LENGTH the byte count,
   ALIGN the known common alignment.  IS_CMPSTRN distinguishes the
   strncmp form from the memcmp form.

   Returns true if the expansion was emitted, false to fall back to a
   library call.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
                               rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.
     The rep-prefixed expansion implicitly needs these hard registers.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
         than actual string lengths.  We can expand the cmpstrn pattern
         to "repz cmpsb" only if one of the strings is a constant so
         that expand_builtin_strncmp() can write the length argument to
         be the minimum of the const string length and the actual length
         argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      /* I.e. require at least one operand to be &"literal"[0]
         (ADDR_EXPR of a STRING_CST inside a MEM_REF).  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
             && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
             && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
                 == STRING_CST))
            || (t2 && TREE_CODE (t2) == MEM_REF
                && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
                && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
                    == STRING_CST))))
        return false;
    }

  /* Force both source addresses into registers, as the cmpstrnqi
     patterns expect register addresses; rewrite the MEMs to match.  */
  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      if (length == const0_rtx)
        {
          /* Comparing zero bytes: the result is trivially equal.  */
          emit_move_insn (result, const0_rtx);
          return true;
        }
      /* Known nonzero constant length: use the _nz variant, which
         presumably omits the zero-length test — see i386.md.  */
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
                                     src1, src2));
    }
  else
    {
      /* Variable length may be zero: test it first so the cmpstrnqi_1
         pattern can handle the zero-length case.  */
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
                                  src1, src2));
    }

  /* cmpintqi materializes the comparison flags as a QImode value
     (see i386.md), which is then sign-extended into RESULT.  */
  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
   10830              : 
   10831              : /* Expand the appropriate insns for doing strlen if not just doing
   10832              :    repnz; scasb
   10833              : 
   10834              :    out = result, initialized with the start address
   10835              :    align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
   10837              :         not aligned, otherwise undefined
   10838              : 
   10839              :    This is just the body. It needs the initializations mentioned above and
   10840              :    some address computing at the end.  These things are done in i386.md.  */
   10841              : 
static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  /* Treat an unknown (non-constant) alignment as alignment 0.  */
  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
        {
          align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
          align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

          /* Leave just the 3 lower bits.  */
          align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
                                    NULL_RTX, 0, OPTAB_WIDEN);

          /* Dispatch on (address & 3): 0 -> aligned loop directly,
             2 -> two bytes to check, 3 -> one byte to check,
             1 -> fall through and check all three bytes.  */
          emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
                                   Pmode, 1, align_4_label);
          emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
                                   Pmode, 1, align_2_label);
          emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
                                   Pmode, 1, align_3_label);
        }
      else
        {
          /* Since the alignment is 2, we have to check 2 or 0 bytes;
             check if is aligned to 4 - byte.  */

          align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
                                    NULL_RTX, 0, OPTAB_WIDEN);

          emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
                                   Pmode, 1, align_4_label);
        }

      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
                               QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
        {
          emit_label (align_2_label);

          emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
                                   end_0_label);

          emit_insn (gen_add2_insn (out, const1_rtx));

          emit_label (align_3_label);
        }

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
                               end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  /* Advance past the word just loaded; corrected below once the zero
     byte's position within the word is known.  */
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.  */

  /* tmpreg = (word - 0x01010101) & ~word & 0x80808080 — the classic
     has-zero-byte bit trick: a byte's 0x80 bit is set iff that byte
     was zero.  */
  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
                         gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
                           align_4_label);

  if (TARGET_CMOVE)
    {
       rtx reg = gen_reg_rtx (SImode);
       rtx reg2 = gen_reg_rtx (Pmode);
       emit_move_insn (reg, tmpreg);
       emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

       /* If zero is not in the first two bytes, move two bytes forward.  */
       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
       /* cmov: select the shifted mask when the low two bytes held
          no zero.  Both cmovs below consume the same flags, so nothing
          may clobber FLAGS_REG in between.  */
       emit_insn (gen_rtx_SET (tmpreg,
                               gen_rtx_IF_THEN_ELSE (SImode, tmp,
                                                     reg,
                                                     tmpreg)));
       /* Emit lea manually to avoid clobbering of flags.  */
       emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
       emit_insn (gen_rtx_SET (out,
                               gen_rtx_IF_THEN_ELSE (Pmode, tmp,
                                                     reg2,
                                                     out)));
    }
  else
    {
       rtx_code_label *end_2_label = gen_label_rtx ();
       /* Is zero in the first two bytes? */

       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
       /* Conditional jump to END_2_LABEL, built by hand so the
          JUMP_LABEL can be attached.  */
       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                            gen_rtx_LABEL_REF (VOIDmode, end_2_label),
                            pc_rtx);
       tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
       JUMP_LABEL (tmp) = end_2_label;

       /* Not in the first two.  Move two bytes forward.  */
       emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
       emit_insn (gen_add2_insn (out, const2_rtx));

       emit_label (end_2_label);

    }

  /* Avoid branch in fixing the byte.  */
  /* At this point the zero byte is in the low two bytes of TMPREG and
     OUT overshoots by 2 or 3; adding tmpreg's low byte to itself sets
     carry iff its 0x80 bit was set, and sub-with-borrow of 3 then
     yields the correction of -3 or -2 without a branch.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
   11002              : 
   11003              : /* Expand strlen.  */
   11004              : 
   11005              : bool
   11006        13626 : ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
   11007              : {
   11008        13626 : if (TARGET_UNROLL_STRLEN
   11009        13626 :            && TARGET_INLINE_ALL_STRINGOPS
   11010           11 :            && eoschar == const0_rtx
   11011           11 :            && optimize > 1)
   11012              :     {
   11013              :       /* The generic case of strlen expander is long.  Avoid it's
   11014              :          expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
   11015           15 :       rtx addr = force_reg (Pmode, XEXP (src, 0));
   11016              :       /* Well it seems that some optimizer does not combine a call like
   11017              :          foo(strlen(bar), strlen(bar));
   11018              :          when the move and the subtraction is done here.  It does calculate
   11019              :          the length just once when these instructions are done inside of
   11020              :          output_strlen_unroll().  But I think since &bar[strlen(bar)] is
   11021              :          often used and I use one fewer register for the lifetime of
   11022              :          output_strlen_unroll() this is better.  */
   11023              : 
   11024           11 :       emit_move_insn (out, addr);
   11025              : 
   11026           11 :       ix86_expand_strlensi_unroll_1 (out, src, align);
   11027              : 
   11028              :       /* strlensi_unroll_1 returns the address of the zero at the end of
   11029              :          the string, like memchr(), so compute the length by subtracting
   11030              :          the start address.  */
   11031           11 :       emit_insn (gen_sub2_insn (out, addr));
   11032           11 :       return true;
   11033              :     }
   11034              :   else
   11035              :     return false;
   11036              : }
   11037              : 
/* For a given symbol (function), construct code to compute the address of
   its PLT entry in the large x86-64 PIC model.  */
   11040              : 
   11041              : static rtx
   11042           31 : construct_plt_address (rtx symbol)
   11043              : {
   11044           31 :   rtx tmp, unspec;
   11045              : 
   11046           31 :   gcc_assert (SYMBOL_REF_P (symbol));
   11047           31 :   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
   11048           31 :   gcc_assert (Pmode == DImode);
   11049              : 
   11050           31 :   tmp = gen_reg_rtx (Pmode);
   11051           31 :   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
   11052              : 
   11053           31 :   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
   11054           31 :   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
   11055           31 :   return tmp;
   11056              : }
   11057              : 
/* Additional registers that are clobbered by SYSV calls.  These are
   call-saved under the MS ABI but call-used under the SysV ABI, so an
   MS-ABI caller must treat them as clobbered when calling a SysV-ABI
   function (see the TARGET_64BIT_MS_ABI handling in ix86_expand_call).  */

static int const x86_64_ms_sysv_extra_clobbered_registers
                 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
   11068              : 
   11069              : rtx_insn *
   11070      6195436 : ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
   11071              :                   rtx callarg2,
   11072              :                   rtx pop, bool sibcall)
   11073              : {
   11074      6195436 :   rtx vec[3];
   11075      6195436 :   rtx use = NULL, call;
   11076      6195436 :   unsigned int vec_len = 0;
   11077      6195436 :   tree fndecl;
   11078      6195436 :   bool call_no_callee_saved_registers = false;
   11079              : 
   11080      6195436 :   if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
   11081              :     {
   11082      6007221 :       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
   11083      6007221 :       if (fndecl)
   11084              :         {
   11085      5747305 :           if (lookup_attribute ("interrupt",
   11086      5747305 :                                 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
   11087            1 :             error ("interrupt service routine cannot be called directly");
   11088      5747304 :           else if (ix86_type_no_callee_saved_registers_p (TREE_TYPE (fndecl)))
   11089      5747305 :             call_no_callee_saved_registers = true;
   11090      5747305 :           if (fndecl == current_function_decl
   11091      5747305 :               && decl_binds_to_current_def_p (fndecl))
   11092        11092 :             cfun->machine->recursive_function = true;
   11093              :         }
   11094              :     }
   11095              :   else
   11096              :     {
   11097       188215 :       if (MEM_P (fnaddr))
   11098              :         {
   11099       188215 :           tree mem_expr = MEM_EXPR (fnaddr);
   11100       188215 :           if (mem_expr != nullptr
   11101       188170 :               && TREE_CODE (mem_expr) == MEM_REF
   11102       376385 :               && ix86_type_no_callee_saved_registers_p (TREE_TYPE (mem_expr)))
   11103              :             call_no_callee_saved_registers = true;
   11104              :         }
   11105              : 
   11106              :       fndecl = NULL_TREE;
   11107              :     }
   11108              : 
   11109      6195436 :   if (pop == const0_rtx)
   11110            0 :     pop = NULL;
   11111      6195436 :   gcc_assert (!TARGET_64BIT || !pop);
   11112              : 
   11113      6195436 :   rtx addr = XEXP (fnaddr, 0);
   11114      6195436 :   if (TARGET_MACHO && !TARGET_64BIT)
   11115              :     {
   11116              : #if TARGET_MACHO
   11117              :       if (flag_pic && SYMBOL_REF_P (XEXP (fnaddr, 0)))
   11118              :         fnaddr = machopic_indirect_call_target (fnaddr);
   11119              : #endif
   11120              :     }
   11121              :   else
   11122              :     {
   11123              :       /* Static functions and indirect calls don't need the pic register.  Also,
   11124              :          check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
   11125              :          it an indirect call.  */
   11126      6195436 :       if (flag_pic
   11127       522294 :           && SYMBOL_REF_P (addr)
   11128      6691305 :           && ix86_call_use_plt_p (addr))
   11129              :         {
   11130       396494 :           if (flag_plt
   11131       396494 :               && (SYMBOL_REF_DECL (addr) == NULL_TREE
   11132       396460 :                   || !lookup_attribute ("noplt",
   11133       396460 :                                         DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
   11134              :             {
   11135       396459 :               if (!TARGET_64BIT
   11136       219199 :                   || (ix86_cmodel == CM_LARGE_PIC
   11137              :                       && DEFAULT_ABI != MS_ABI))
   11138              :                 {
   11139       531811 :                   use_reg (&use, gen_rtx_REG (Pmode,
   11140              :                                               REAL_PIC_OFFSET_TABLE_REGNUM));
   11141       177291 :                   if (ix86_use_pseudo_pic_reg ())
   11142       354551 :                     emit_move_insn (gen_rtx_REG (Pmode,
   11143       177291 :                                                  REAL_PIC_OFFSET_TABLE_REGNUM),
   11144              :                                     pic_offset_table_rtx);
   11145              :                 }
   11146              :             }
   11147           35 :           else if (!TARGET_PECOFF && !TARGET_MACHO)
   11148              :             {
   11149           35 :               if (TARGET_64BIT
   11150           35 :                   && ix86_cmodel == CM_LARGE_PIC
   11151              :                   && DEFAULT_ABI != MS_ABI)
   11152              :                 {
   11153            1 :                   fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
   11154              :                                            UNSPEC_GOT);
   11155            1 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11156            1 :                   fnaddr = force_reg (Pmode, fnaddr);
   11157            1 :                   fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
   11158              :                 }
   11159           34 :               else if (TARGET_64BIT)
   11160              :                 {
   11161           38 :                   fnaddr = gen_rtx_UNSPEC (Pmode,
   11162              :                                            gen_rtvec (1, addr),
   11163              :                                            UNSPEC_GOTPCREL);
   11164           38 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11165              :                 }
   11166              :               else
   11167              :                 {
   11168            0 :                   fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
   11169              :                                            UNSPEC_GOT);
   11170            0 :                   fnaddr = gen_rtx_CONST (Pmode, fnaddr);
   11171            0 :                   fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
   11172              :                                          fnaddr);
   11173              :                 }
   11174           39 :               fnaddr = gen_const_mem (Pmode, fnaddr);
   11175              :               /* Pmode may not be the same as word_mode for x32, which
   11176              :                  doesn't support indirect branch via 32-bit memory slot.
   11177              :                  Since x32 GOT slot is 64 bit with zero upper 32 bits,
   11178              :                  indirect branch via x32 GOT slot is OK.  */
   11179           35 :               if (GET_MODE (fnaddr) != word_mode)
   11180            4 :                 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
   11181           35 :               fnaddr = gen_rtx_MEM (QImode, fnaddr);
   11182              :             }
   11183              :         }
   11184              :     }
   11185              : 
   11186              :   /* Skip setting up RAX register for -mskip-rax-setup when there are no
   11187              :      parameters passed in vector registers.  */
   11188      6195436 :   if (TARGET_64BIT
   11189      5357500 :       && (INTVAL (callarg2) > 0
   11190      5296340 :           || (INTVAL (callarg2) == 0
   11191       316879 :               && (TARGET_SSE || !flag_skip_rax_setup))))
   11192              :     {
   11193       378037 :       rtx al = gen_rtx_REG (QImode, AX_REG);
   11194       378037 :       emit_move_insn (al, callarg2);
   11195       378037 :       use_reg (&use, al);
   11196              :     }
   11197              : 
   11198      6195436 :   if (ix86_cmodel == CM_LARGE_PIC
   11199              :       && !TARGET_PECOFF
   11200           41 :       && MEM_P (fnaddr)
   11201           41 :       && SYMBOL_REF_P (XEXP (fnaddr, 0))
   11202      6195469 :       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
   11203           31 :     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
   11204              :   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
   11205              :      branch via x32 GOT slot is OK.  */
   11206      6195405 :   else if (TARGET_X32
   11207           74 :       && MEM_P (fnaddr)
   11208           74 :       && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
   11209            8 :       && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)
   11210      6195409 :       && !TARGET_INDIRECT_BRANCH_REGISTER)
   11211              :     ;
   11212      6195405 :   else if (sibcall
   11213      6195405 :            ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
   11214      6066056 :            : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
   11215              :     {
   11216          531 :       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
   11217          531 :       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
   11218              :     }
   11219              : 
   11220              :   /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
   11221              :      mask off code pointers here.
   11222              :      TODO: also need to handle indirect jump.  */
   11223      6196474 :   if (ix86_memtag_can_tag_addresses () && !fndecl
   11224      6195460 :       && sanitize_flags_p (SANITIZE_HWADDRESS))
   11225              :     {
   11226           24 :       rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
   11227              :                                                         NULL_RTX);
   11228           24 :       fnaddr = gen_rtx_MEM (QImode, untagged_addr);
   11229              :     }
   11230              : 
   11231      6195436 :   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
   11232              : 
   11233      6195436 :   if (retval)
   11234      2451605 :     call = gen_rtx_SET (retval, call);
   11235      6195436 :   vec[vec_len++] = call;
   11236              : 
   11237      6195436 :   if (pop)
   11238              :     {
   11239       449838 :       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
   11240       224919 :       pop = gen_rtx_SET (stack_pointer_rtx, pop);
   11241       224919 :       vec[vec_len++] = pop;
   11242              :     }
   11243              : 
   11244      6195436 :   static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
   11245              : 
   11246      6195436 :   if ((cfun->machine->call_saved_registers
   11247      6195436 :        == TYPE_NO_CALLER_SAVED_REGISTERS)
   11248      6195436 :       && (!fndecl
   11249          468 :           || (!TREE_THIS_VOLATILE (fndecl)
   11250          186 :               && !lookup_attribute ("no_caller_saved_registers",
   11251          186 :                                     TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
   11252              :     {
   11253          182 :       bool is_64bit_ms_abi = (TARGET_64BIT
   11254          182 :                               && ix86_function_abi (fndecl) == MS_ABI);
   11255          182 :       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
   11256              : 
   11257              :       /* If there are no caller-saved registers, add all registers
   11258              :          that are clobbered by the call which returns.  */
   11259        16926 :       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
   11260        16744 :         if (!fixed_regs[i]
   11261         3242 :             && (ix86_call_used_regs[i] == 1
   11262         1506 :                 || (ix86_call_used_regs[i] & c_mask))
   11263         2150 :             && !STACK_REGNO_P (i)
   11264         2150 :             && !MMX_REGNO_P (i))
   11265         2150 :           clobber_reg (&use,
   11266         2150 :                        gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
   11267              :     }
   11268      5357318 :   else if (TARGET_64BIT_MS_ABI
   11269      6268657 :            && (!callarg2 || INTVAL (callarg2) != -2))
   11270              :     {
   11271              :       unsigned i;
   11272              : 
   11273       861718 :       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
   11274              :         {
   11275       795432 :           int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
   11276       795432 :           machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
   11277              : 
   11278       795432 :           clobber_reg (&use, gen_rtx_REG (mode, regno));
   11279              :         }
   11280              : 
   11281              :       /* Set here, but it may get cleared later.  */
   11282        66286 :       if (TARGET_CALL_MS2SYSV_XLOGUES)
   11283              :         {
   11284         7046 :           if (!TARGET_SSE)
   11285              :             ;
   11286              : 
   11287              :           /* Don't break hot-patched functions.  */
   11288         7046 :           else if (ix86_function_ms_hook_prologue (current_function_decl))
   11289              :             ;
   11290              : 
   11291              :           /* TODO: Cases not yet examined.  */
   11292         7046 :           else if (flag_split_stack)
   11293            0 :             warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
   11294              : 
   11295              :           else
   11296              :             {
   11297         7046 :               gcc_assert (!reload_completed);
   11298         7046 :               cfun->machine->call_ms2sysv = true;
   11299              :             }
   11300              :         }
   11301              :     }
   11302              : 
   11303      6195436 :   if (TARGET_MACHO && TARGET_64BIT && !sibcall
   11304              :       && ((SYMBOL_REF_P (addr) && !SYMBOL_REF_LOCAL_P (addr))
   11305              :           || !fndecl || TREE_PUBLIC (fndecl)))
   11306              :     {
   11307              :       /* We allow public functions defined in a TU to bind locally for PIC
   11308              :          code (the default) on 64bit Mach-O.
   11309              :          If such functions are not inlined, we cannot tell at compile-time if
   11310              :          they will be called via the lazy symbol resolver (this can depend on
   11311              :          options given at link-time).  Therefore, we must assume that the lazy
   11312              :          resolver could be used which clobbers R11 and R10.  */
   11313              :       clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
   11314              :       clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
   11315              :     }
   11316              : 
   11317      6195436 :   if (call_no_callee_saved_registers)
   11318              :     {
   11319              :       /* After calling a no_callee_saved_registers function, all
   11320              :          registers may be clobbered.  Clobber all registers that are
   11321              :          not used by the callee.  */
   11322           59 :       bool is_64bit_ms_abi = (TARGET_64BIT
   11323           59 :                               && ix86_function_abi (fndecl) == MS_ABI);
   11324           59 :       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
   11325         5487 :       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
   11326         5428 :         if (!fixed_regs[i]
   11327         2597 :             && i != HARD_FRAME_POINTER_REGNUM
   11328         2538 :             && !(ix86_call_used_regs[i] == 1
   11329          973 :                  || (ix86_call_used_regs[i] & c_mask))
   11330          295 :             && !STACK_REGNO_P (i)
   11331          295 :             && !MMX_REGNO_P (i))
   11332          295 :           clobber_reg (&use,
   11333          295 :                        gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
   11334              :     }
   11335              : 
   11336      6195436 :   if (vec_len > 1)
   11337       224919 :     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
   11338      6195436 :   rtx_insn *call_insn = emit_call_insn (call);
   11339      6195436 :   if (use)
   11340       595756 :     CALL_INSN_FUNCTION_USAGE (call_insn) = use;
   11341              : 
   11342      6195436 :   return call_insn;
   11343              : }
   11344              : 
/* Split simple return with popping POPC bytes from stack to indirect
   branch with stack adjustment.  POPC is a CONST_INT byte count taken
   from the "pascal"-style callee-pops calling convention.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  /* Pop the return address into %ecx and keep the epilogue frame-state
     offsets in sync with the one-word stack adjustment.  */
  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  /* Record the CFA adjustment for the pop, and record that the return
     address (PC) is now held in %ecx, so the unwinder can find it.  */
  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Release the POPC bytes of incoming arguments from the stack,
     again with a matching CFA note.  */
  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
   11377              : 
   11378              : /* Errors in the source file can cause expand_expr to return const0_rtx
   11379              :    where we expect a vector.  To avoid crashing, use one of the vector
   11380              :    clear instructions.  */
   11381              : 
   11382              : static rtx
   11383       196132 : safe_vector_operand (rtx x, machine_mode mode)
   11384              : {
   11385            0 :   if (x == const0_rtx)
   11386            0 :     x = CONST0_RTX (mode);
   11387           24 :   return x;
   11388              : }
   11389              : 
/* Subroutine of ix86_expand_builtin to take care of binop insns.
   ICODE is the two-operand insn to emit, EXP the CALL_EXPR being
   expanded, and TARGET a suggested (possibly NULL) destination.
   Returns the rtx holding the result, or 0 if ICODE's generator
   declined to produce a pattern.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  /* Guard against const0_rtx appearing where a vector is expected
     (can happen after front-end errors).  */
  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Reuse TARGET only when not optimizing and it already has the
     right mode and satisfies the destination predicate.  */
  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* Widen an SImode operand to the TImode the insn expects by loading
     it into the low element of a V4SI vector and taking the TImode
     view of that register.  */
  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  /* Force operands into registers when they fail the predicates.  */
  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
   11434              : 
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns
   (XOP builtins).  ICODE is the insn to emit, EXP the CALL_EXPR being
   expanded, TARGET a suggested destination, M_TYPE encodes the arity
   and operand shape of the builtin, and SUB_CODE is the rtx comparison
   or selector code used by the CMP/TF variants.  Returns the result
   rtx, or 0 if the generator produced no pattern.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
                               enum ix86_builtin_func_type m_type,
                               enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;
  bool tf_p = false;
  bool last_arg_constant = false;
  int num_memory = 0;
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  /* Decode arity and flags from the builtin function type.  */
  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  /* Reuse TARGET only when not optimizing and it fits the insn's
     destination mode and predicate; otherwise use a fresh pseudo.  */
  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      /* Comparison insns carry the comparison rtx as operand 1, so
         argument I maps to insn operand I+2 in that case.  */
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
        {
          if (!insn_data[icode].operand[i + 1].predicate (op, mode))
            {
              /* The last argument was required to be an immediate but
                 isn't one; either diagnose, or fall back to a variant
                 that accepts a variable operand.  */
              enum insn_code new_icode = icode;
              switch (icode)
                {
                case CODE_FOR_xop_vpermil2v2df3:
                case CODE_FOR_xop_vpermil2v4sf3:
                case CODE_FOR_xop_vpermil2v4df3:
                case CODE_FOR_xop_vpermil2v8sf3:
                  error ("the last argument must be a 2-bit immediate");
                  return gen_reg_rtx (tmode);
                case CODE_FOR_xop_rotlv2di3:
                  new_icode = CODE_FOR_rotlv2di3;
                  goto xop_rotl;
                case CODE_FOR_xop_rotlv4si3:
                  new_icode = CODE_FOR_rotlv4si3;
                  goto xop_rotl;
                case CODE_FOR_xop_rotlv8hi3:
                  new_icode = CODE_FOR_rotlv8hi3;
                  goto xop_rotl;
                case CODE_FOR_xop_rotlv16qi3:
                  new_icode = CODE_FOR_rotlv16qi3;
                xop_rotl:
                  if (CONST_INT_P (op))
                    {
                      /* Rotate counts are taken modulo the element
                         width, so mask the constant into range.  */
                      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
                      op = GEN_INT (INTVAL (op) & mask);
                      gcc_checking_assert
                        (insn_data[icode].operand[i + 1].predicate (op, mode));
                    }
                  else
                    {
                      /* Variable rotate count: switch to the generic
                         rotl pattern, which must agree with the XOP
                         one on operand modes and predicates.  */
                      gcc_checking_assert
                        (nargs == 2
                         && insn_data[new_icode].operand[0].mode == tmode
                         && insn_data[new_icode].operand[1].mode == tmode
                         && insn_data[new_icode].operand[2].mode == mode
                         && insn_data[new_icode].operand[0].predicate
                            == insn_data[icode].operand[0].predicate
                         && insn_data[new_icode].operand[1].predicate
                            == insn_data[icode].operand[1].predicate);
                      icode = new_icode;
                      goto non_constant;
                    }
                  break;
                default:
                  gcc_unreachable ();
                }
            }
        }
      else
        {
        non_constant:
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          /* If we aren't optimizing, only allow one memory operand to be
             generated.  */
          if (memory_operand (op, mode))
            num_memory++;

          gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

          if (optimize
              || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
              || num_memory > 1)
            op = force_reg (mode, op);
        }

      xops[i] = op;
    }

  /* Emit the pattern appropriate for the arity decoded above.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
        pat = GEN_FCN (icode) (target, xops[0], xops[1],
                               GEN_INT ((int)sub_code));
      else if (! comparison_p)
        pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
        {
          /* Comparison insns take the comparison rtx itself as an
             extra operand ahead of the two inputs.  */
          rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
                                       xops[0], xops[1]);

          pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
        }
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
   11664              : 
   11665              : /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   11666              :    insns with vec_merge.  */
   11667              : 
   11668              : static rtx
   11669           52 : ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
   11670              :                                     rtx target)
   11671              : {
   11672           52 :   rtx pat;
   11673           52 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11674           52 :   rtx op1, op0 = expand_normal (arg0);
   11675           52 :   machine_mode tmode = insn_data[icode].operand[0].mode;
   11676           52 :   machine_mode mode0 = insn_data[icode].operand[1].mode;
   11677              : 
   11678           16 :   if (optimize || !target
   11679           16 :       || GET_MODE (target) != tmode
   11680           68 :       || !insn_data[icode].operand[0].predicate (target, tmode))
   11681           36 :     target = gen_reg_rtx (tmode);
   11682              : 
   11683           52 :   if (VECTOR_MODE_P (mode0))
   11684           52 :     op0 = safe_vector_operand (op0, mode0);
   11685              : 
   11686           36 :   if ((optimize && !register_operand (op0, mode0))
   11687           88 :       || !insn_data[icode].operand[1].predicate (op0, mode0))
   11688            0 :     op0 = copy_to_mode_reg (mode0, op0);
   11689              : 
   11690           52 :   op1 = op0;
   11691           52 :   if (!insn_data[icode].operand[2].predicate (op1, mode0))
   11692           16 :     op1 = copy_to_mode_reg (mode0, op1);
   11693              : 
   11694           52 :   pat = GEN_FCN (icode) (target, op0, op1);
   11695           52 :   if (! pat)
   11696              :     return 0;
   11697           52 :   emit_insn (pat);
   11698           52 :   return target;
   11699              : }
   11700              : 
/* Subroutine of ix86_expand_builtin to take care of comparison insns.
   D describes the builtin (insn code and comparison code), EXP is the
   CALL_EXPR, TARGET a suggested destination, and SWAP requests that
   the two operands be exchanged first (used for comparisons the
   hardware only implements in one direction).  Returns the result
   rtx, or 0 if no pattern was generated.  */

static rtx
ix86_expand_sse_compare (const struct builtin_description *d,
                         tree exp, rtx target, bool swap)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  enum rtx_code comparison = d->comparison;

  /* Guard against const0_rtx appearing where vectors are expected.  */
  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (swap)
    std::swap (op0, op1);

  /* Reuse TARGET only when not optimizing and it fits the insn's
     destination mode and predicate.  */
  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* Force operands into registers when the predicates reject them.  */
  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* The pattern takes the comparison rtx itself as a fourth operand.  */
  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
   11747              : 
/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
 * ordered EQ or unordered NE, generate PF jump.
 *
 * COMPARISON is the condition to test on the flags, CHECK_UNORDERED
 * requests the extra PF (unordered) branch, MODE is the CCmode to
 * test (a subset of the CCFPmode actually set), SET_DST is the flags
 * register rtx produced by the compare, and TARGET is a QImode subreg
 * of the SImode result pseudo (pre-loaded with the value to keep on
 * the unordered path).  Returns the SImode result register.  */

static rtx
ix86_ssecom_setcc (const enum rtx_code comparison,
                   bool check_unordered, machine_mode mode,
                   rtx set_dst, rtx target)
{

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.
     Under TARGET_AVX10_2, VCOMX/VUCOMX are generated instead of
     COMI/UCOMI.  VCOMX/VUCOMX will not set ZF for NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      /* Branch over the setcc when PF says the compare was unordered,
         leaving TARGET's preset value in place.  */
      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
                  || mode == CCOmode || mode == CCPmode
                  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  /* Store the condition into the low byte of the result register.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (comparison, QImode,
                                          set_dst,
                                          const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
   11796              : 
/* Subroutine of ix86_expand_builtin to take care of comi insns.  D
   identifies the builtin (insn code plus the comparison to expose),
   EXP is the CALL_EXPR with two vector arguments, TARGET is ignored
   (a fresh pseudo is always created), and COMX_OK says whether the
   AVX10.2 COMX instructions may replace the legacy COMI/UCOMI
   patterns for the EQ/NE cases.  Returns an SImode rtx holding the
   0/1 comparison result, or 0 on expansion failure.  */

static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
                      rtx target, bool comx_ok)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = d->comparison;
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    /* Canonicalize LE/LT to GE/GT by swapping the operands together
       with the condition.  */
    case LE:    /* -> GE  */
    case LT:    /* -> GT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
      break;
    case EQ:
      /* EQ/NE read ZF (hence CCZmode); without the AVX10.2 COMX forms
	 an extra unordered check is needed, since the compare also
	 sets ZF when either operand is a NaN (see check_unordered use
	 in ix86_ssecom_setcc).  */
      if (!TARGET_AVX10_2 || !comx_ok)
        check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      if (!TARGET_AVX10_2 || !comx_ok)
        check_unordered = true;
      mode = CCZmode;
      /* For NE the result pseudo is preloaded with 1, so an unordered
	 outcome reads as "not equal".  */
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  /* Build the result in a fresh SImode pseudo whose low byte is later
     written via its QImode SUBREG; the incoming TARGET is ignored.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* With AVX10.2, EQ/NE comparisons may use the COMX instructions;
     remap the legacy COMI/UCOMI insn codes to their COMX
     counterparts.  */
  if ((comparison == EQ || comparison == NE)
      && TARGET_AVX10_2 && comx_ok)
    {
      switch (icode)
        {
        case CODE_FOR_sse_comi:
          icode = CODE_FOR_avx10_2_comxsf;
          break;
        case CODE_FOR_sse_ucomi:
          icode = CODE_FOR_avx10_2_ucomxsf;
          break;
        case CODE_FOR_sse2_comi:
          icode = CODE_FOR_avx10_2_comxdf;
          break;
        case CODE_FOR_sse2_ucomi:
          icode = CODE_FOR_avx10_2_ucomxdf;
          break;

        default:
          gcc_unreachable ();
        }
    }
  pat = GEN_FCN (icode) (op0, op1);
  if (! pat)
    return 0;

  /* The generated insn sets a CC register (its SET_DEST); hand that to
     ix86_ssecom_setcc, which emits the setcc sequence into TARGET.  */
  set_dst = SET_DEST (pat);
  emit_insn (pat);
  return ix86_ssecom_setcc (comparison, check_unordered, mode,
                            set_dst, target);
}
   11890              : 
   11891              : /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
   11892              : 
   11893              : static rtx
   11894            0 : ix86_expand_sse_round (const struct builtin_description *d, tree exp,
   11895              :                        rtx target)
   11896              : {
   11897            0 :   rtx pat;
   11898            0 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11899            0 :   rtx op1, op0 = expand_normal (arg0);
   11900            0 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11901            0 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11902              : 
   11903            0 :   if (optimize || target == 0
   11904            0 :       || GET_MODE (target) != tmode
   11905            0 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11906            0 :     target = gen_reg_rtx (tmode);
   11907              : 
   11908            0 :   if (VECTOR_MODE_P (mode0))
   11909            0 :     op0 = safe_vector_operand (op0, mode0);
   11910              : 
   11911            0 :   if ((optimize && !register_operand (op0, mode0))
   11912            0 :       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   11913            0 :     op0 = copy_to_mode_reg (mode0, op0);
   11914              : 
   11915            0 :   op1 = GEN_INT (d->comparison);
   11916              : 
   11917            0 :   pat = GEN_FCN (d->icode) (target, op0, op1);
   11918            0 :   if (! pat)
   11919              :     return 0;
   11920            0 :   emit_insn (pat);
   11921            0 :   return target;
   11922              : }
   11923              : 
   11924              : static rtx
   11925           12 : ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
   11926              :                                      tree exp, rtx target)
   11927              : {
   11928           12 :   rtx pat;
   11929           12 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   11930           12 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   11931           12 :   rtx op0 = expand_normal (arg0);
   11932           12 :   rtx op1 = expand_normal (arg1);
   11933           12 :   rtx op2;
   11934           12 :   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   11935           12 :   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   11936           12 :   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
   11937              : 
   11938            0 :   if (optimize || target == 0
   11939            0 :       || GET_MODE (target) != tmode
   11940           12 :       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   11941           12 :     target = gen_reg_rtx (tmode);
   11942              : 
   11943           12 :   op0 = safe_vector_operand (op0, mode0);
   11944           12 :   op1 = safe_vector_operand (op1, mode1);
   11945              : 
   11946           12 :   if ((optimize && !register_operand (op0, mode0))
   11947           12 :       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   11948           12 :     op0 = copy_to_mode_reg (mode0, op0);
   11949           12 :   if ((optimize && !register_operand (op1, mode1))
   11950           12 :       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
   11951           12 :     op1 = copy_to_mode_reg (mode1, op1);
   11952              : 
   11953           12 :   op2 = GEN_INT (d->comparison);
   11954              : 
   11955           12 :   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
   11956           12 :   if (! pat)
   11957              :     return 0;
   11958           12 :   emit_insn (pat);
   11959           12 :   return target;
   11960              : }
   11961              : 
/* Subroutine of ix86_expand_builtin to take care of ptest insns.
   EXP carries the two vector arguments; the expanded insn sets the
   flags register, and D->comparison selects the flag condition that
   becomes the 0/1 result.  Returns an SImode rtx, or 0 on expansion
   failure.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
                       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  /* ptest reg, reg sets the carry flag.  Hence PTESTC with identical
     operands is known at compile time to yield 1; fold it.  */
  if (comparison == LTU
      && (d->code == IX86_BUILTIN_PTESTC
          || d->code == IX86_BUILTIN_PTESTC256)
      && rtx_equal_p (op0, op1))
    {
      if (!target)
        target = gen_reg_rtx (SImode);
      emit_move_insn (target, const1_rtx);
      return target;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Zero an SImode pseudo and later write the flag test into its low
     byte through STRICT_LOW_PART, giving a zero-extended 0/1.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  /* For this pattern operands 0 and 1 are both inputs, so their
     predicates/modes match OP0/OP1 directly.  */
  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  /* SET_DEST (pat) is the flags register the ptest insn wrote; compare
     it against zero with the requested condition.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (comparison, QImode,
                                          SET_DEST (pat),
                                          const0_rtx)));

  return SUBREG_REG (target);
}
   12016              : 
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.
   EXP carries five arguments: two vectors, their two explicit lengths,
   and an 8-bit control immediate.  Depending on D->code the result is
   the index (PCMPESTRI128, insn operand 0), the mask (PCMPESTRM128,
   insn operand 1), or - when D->flag is set - a single flag bit
   extracted from the flags register.  */

static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
                          tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  /* Insn operand layout: 0 = index result, 1 = mask result,
     2/3 = first vector and its length, 4/5 = second vector and its
     length, 6 = control immediate.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  /* The control must be a compile-time 8-bit immediate; diagnose and
     bail out with a dummy result otherwise.  */
  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      /* Index flavor: the caller wants insn operand 0; operand 1 goes
	 into a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode0
          || !insn_data[d->icode].operand[0].predicate (target, tmode0))
        target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      /* Mask flavor: the caller wants insn operand 1; operand 0 goes
	 into a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode1
          || !insn_data[d->icode].operand[1].predicate (target, tmode1))
        target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      /* Flag flavor: both value results are scratch; the answer is read
	 from the flags register below.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* D->flag encodes the CC mode to read; zero an SImode pseudo and
	 set its low byte from the EQ test of that flags mode against
	 zero.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
        (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                      gen_rtx_fmt_ee (EQ, QImode,
                                      gen_rtx_REG ((machine_mode) d->flag,
                                                   FLAGS_REG),
                                      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
   12120              : 
   12121              : 
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.
   Like ix86_expand_sse_pcmpestr, but for the implicit-length forms:
   EXP carries three arguments - two vectors and an 8-bit control
   immediate.  Depending on D->code the result is the index
   (PCMPISTRI128, insn operand 0), the mask (PCMPISTRM128, insn
   operand 1), or - when D->flag is set - a flag bit extracted from the
   flags register.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
                          tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  /* Insn operand layout: 0 = index result, 1 = mask result,
     2/3 = the two input vectors, 4 = control immediate.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  /* The control must be a compile-time 8-bit immediate; diagnose and
     bail out with a dummy result otherwise.  */
  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      /* Index flavor: the caller wants insn operand 0; operand 1 goes
	 into a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode0
          || !insn_data[d->icode].operand[0].predicate (target, tmode0))
        target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      /* Mask flavor: the caller wants insn operand 1; operand 0 goes
	 into a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode1
          || !insn_data[d->icode].operand[1].predicate (target, tmode1))
        target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      /* Flag flavor: both value results are scratch; the answer is read
	 from the flags register below.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* D->flag encodes the CC mode to read; zero an SImode pseudo and
	 set its low byte from the EQ test of that flags mode against
	 zero.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
        (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                      gen_rtx_fmt_ee (EQ, QImode,
                                      gen_rtx_REG ((machine_mode) d->flag,
                                                   FLAGS_REG),
                                      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
   12215              : 
   12216              : /* Fixup modeless constants to fit required mode.  */
   12217              : 
   12218              : static rtx
   12219       258822 : fixup_modeless_constant (rtx x, machine_mode mode)
   12220              : {
   12221       258822 :   if (GET_MODE (x) == VOIDmode)
   12222        41227 :     x = convert_to_mode (mode, x, 1);
   12223       258822 :   return x;
   12224              : }
   12225              : 
   12226              : /* Expand the outgoing argument ARG to extract unsigned char and short
   12227              :    integer constants suitable for the predicates and the instruction
   12228              :    templates which expect the unsigned expanded value.  */
   12229              : 
   12230              : static rtx
   12231       280000 : ix86_expand_unsigned_small_int_cst_argument (tree arg)
   12232              : {
   12233              :   /* When passing 0xff as an unsigned char function argument with the
   12234              :      C frontend promotion, expand_normal gets
   12235              : 
   12236              :      <integer_cst 0x7fffe6aa23a8 type <integer_type 0x7fffe98225e8 int> constant 255>
   12237              : 
   12238              :      and returns the rtx value using the sign-extended representation:
   12239              : 
   12240              :      (const_int 255 [0xff])
   12241              : 
   12242              :      Without the C frontend promotion, expand_normal gets
   12243              : 
   12244              :      <integer_cst 0x7fffe9824018 type <integer_type 0x7fffe9822348 unsigned char > constant 255>
   12245              : 
   12246              :      and returns
   12247              : 
   12248              :      (const_int -1 [0xffffffffffffffff])
   12249              : 
   12250              :      which doesn't work with the predicates nor the instruction templates
   12251              :      which expect the unsigned expanded value.  Extract the unsigned char
   12252              :      and short integer constants to return
   12253              : 
   12254              :      (const_int 255 [0xff])
   12255              : 
   12256              :      so that the expanded value is always unsigned, without the C frontend
   12257              :      promotion.  */
   12258              : 
   12259       280000 :   if (TREE_CODE (arg) == INTEGER_CST)
   12260              :     {
   12261        60052 :       tree type = TREE_TYPE (arg);
   12262        60052 :       if (INTEGRAL_TYPE_P (type)
   12263        60052 :           && TYPE_UNSIGNED (type)
   12264        81834 :           && TYPE_PRECISION (type) < TYPE_PRECISION (integer_type_node))
   12265              :         {
   12266        18298 :           HOST_WIDE_INT cst = TREE_INT_CST_LOW (arg);
   12267        18298 :           return GEN_INT (cst);
   12268              :         }
   12269              :     }
   12270              : 
   12271       261702 :   return expand_normal (arg);
   12272              : }
   12273              : 
   12274              : /* Subroutine of ix86_expand_builtin to take care of insns with
   12275              :    variable number of operands.  */
   12276              : 
   12277              : static rtx
   12278        69393 : ix86_expand_args_builtin (const struct builtin_description *d,
   12279              :                           tree exp, rtx target)
   12280              : {
   12281        69393 :   rtx pat, real_target;
   12282        69393 :   unsigned int i, nargs;
   12283        69393 :   unsigned int nargs_constant = 0;
   12284        69393 :   unsigned int mask_pos = 0;
   12285        69393 :   int num_memory = 0;
   12286        69393 :   rtx xops[6];
   12287        69393 :   bool second_arg_count = false;
   12288        69393 :   enum insn_code icode = d->icode;
   12289        69393 :   const struct insn_data_d *insn_p = &insn_data[icode];
   12290        69393 :   machine_mode tmode = insn_p->operand[0].mode;
   12291        69393 :   machine_mode rmode = VOIDmode;
   12292        69393 :   bool swap = false;
   12293        69393 :   enum rtx_code comparison = d->comparison;
   12294              : 
   12295        69393 :   switch ((enum ix86_builtin_func_type) d->flag)
   12296              :     {
   12297            0 :     case V2DF_FTYPE_V2DF_ROUND:
   12298            0 :     case V4DF_FTYPE_V4DF_ROUND:
   12299            0 :     case V8DF_FTYPE_V8DF_ROUND:
   12300            0 :     case V4SF_FTYPE_V4SF_ROUND:
   12301            0 :     case V8SF_FTYPE_V8SF_ROUND:
   12302            0 :     case V16SF_FTYPE_V16SF_ROUND:
   12303            0 :     case V8HF_FTYPE_V8HF_ROUND:
   12304            0 :     case V16HF_FTYPE_V16HF_ROUND:
   12305            0 :     case V32HF_FTYPE_V32HF_ROUND:
   12306            0 :     case V4SI_FTYPE_V4SF_ROUND:
   12307            0 :     case V8SI_FTYPE_V8SF_ROUND:
   12308            0 :     case V16SI_FTYPE_V16SF_ROUND:
   12309            0 :       return ix86_expand_sse_round (d, exp, target);
   12310           12 :     case V4SI_FTYPE_V2DF_V2DF_ROUND:
   12311           12 :     case V8SI_FTYPE_V4DF_V4DF_ROUND:
   12312           12 :     case V16SI_FTYPE_V8DF_V8DF_ROUND:
   12313           12 :       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
   12314          235 :     case INT_FTYPE_V8SF_V8SF_PTEST:
   12315          235 :     case INT_FTYPE_V4DI_V4DI_PTEST:
   12316          235 :     case INT_FTYPE_V4DF_V4DF_PTEST:
   12317          235 :     case INT_FTYPE_V4SF_V4SF_PTEST:
   12318          235 :     case INT_FTYPE_V2DI_V2DI_PTEST:
   12319          235 :     case INT_FTYPE_V2DF_V2DF_PTEST:
   12320          235 :       return ix86_expand_sse_ptest (d, exp, target);
   12321              :     case FLOAT128_FTYPE_FLOAT128:
   12322              :     case FLOAT_FTYPE_FLOAT:
   12323              :     case FLOAT_FTYPE_BFLOAT16:
   12324              :     case INT_FTYPE_INT:
   12325              :     case UINT_FTYPE_UINT:
   12326              :     case UINT16_FTYPE_UINT16:
   12327              :     case UINT64_FTYPE_INT:
   12328              :     case UINT64_FTYPE_UINT64:
   12329              :     case INT64_FTYPE_INT64:
   12330              :     case INT64_FTYPE_V4SF:
   12331              :     case INT64_FTYPE_V2DF:
   12332              :     case INT_FTYPE_V16QI:
   12333              :     case INT_FTYPE_V8QI:
   12334              :     case INT_FTYPE_V8SF:
   12335              :     case INT_FTYPE_V4DF:
   12336              :     case INT_FTYPE_V4SF:
   12337              :     case INT_FTYPE_V2DF:
   12338              :     case INT_FTYPE_V32QI:
   12339              :     case V16QI_FTYPE_V16QI:
   12340              :     case V8SI_FTYPE_V8SF:
   12341              :     case V8SI_FTYPE_V4SI:
   12342              :     case V8HI_FTYPE_V8HI:
   12343              :     case V8HI_FTYPE_V16QI:
   12344              :     case V8QI_FTYPE_V8QI:
   12345              :     case V8SF_FTYPE_V8SF:
   12346              :     case V8SF_FTYPE_V8SI:
   12347              :     case V8SF_FTYPE_V4SF:
   12348              :     case V8SF_FTYPE_V8HI:
   12349              :     case V4SI_FTYPE_V4SI:
   12350              :     case V4SI_FTYPE_V16QI:
   12351              :     case V4SI_FTYPE_V4SF:
   12352              :     case V4SI_FTYPE_V8SI:
   12353              :     case V4SI_FTYPE_V8HI:
   12354              :     case V4SI_FTYPE_V4DF:
   12355              :     case V4SI_FTYPE_V2DF:
   12356              :     case V4HI_FTYPE_V4HI:
   12357              :     case V4DF_FTYPE_V4DF:
   12358              :     case V4DF_FTYPE_V4SI:
   12359              :     case V4DF_FTYPE_V4SF:
   12360              :     case V4DF_FTYPE_V2DF:
   12361              :     case V4SF_FTYPE_V4SF:
   12362              :     case V4SF_FTYPE_V4SI:
   12363              :     case V4SF_FTYPE_V8SF:
   12364              :     case V4SF_FTYPE_V4DF:
   12365              :     case V4SF_FTYPE_V8HI:
   12366              :     case V4SF_FTYPE_V2DF:
   12367              :     case V2DI_FTYPE_V2DI:
   12368              :     case V2DI_FTYPE_V16QI:
   12369              :     case V2DI_FTYPE_V8HI:
   12370              :     case V2DI_FTYPE_V4SI:
   12371              :     case V2DF_FTYPE_V2DF:
   12372              :     case V2DF_FTYPE_V4SI:
   12373              :     case V2DF_FTYPE_V4DF:
   12374              :     case V2DF_FTYPE_V4SF:
   12375              :     case V2DF_FTYPE_V2SI:
   12376              :     case V2SI_FTYPE_V2SI:
   12377              :     case V2SI_FTYPE_V4SF:
   12378              :     case V2SI_FTYPE_V2SF:
   12379              :     case V2SI_FTYPE_V2DF:
   12380              :     case V2SF_FTYPE_V2SF:
   12381              :     case V2SF_FTYPE_V2SI:
   12382              :     case V32QI_FTYPE_V32QI:
   12383              :     case V32QI_FTYPE_V16QI:
   12384              :     case V16HI_FTYPE_V16HI:
   12385              :     case V16HI_FTYPE_V8HI:
   12386              :     case V8SI_FTYPE_V8SI:
   12387              :     case V16HI_FTYPE_V16QI:
   12388              :     case V8SI_FTYPE_V16QI:
   12389              :     case V4DI_FTYPE_V16QI:
   12390              :     case V8SI_FTYPE_V8HI:
   12391              :     case V4DI_FTYPE_V8HI:
   12392              :     case V4DI_FTYPE_V4SI:
   12393              :     case V4DI_FTYPE_V2DI:
   12394              :     case UQI_FTYPE_UQI:
   12395              :     case UHI_FTYPE_UHI:
   12396              :     case USI_FTYPE_USI:
   12397              :     case USI_FTYPE_UQI:
   12398              :     case USI_FTYPE_UHI:
   12399              :     case UDI_FTYPE_UDI:
   12400              :     case UHI_FTYPE_V16QI:
   12401              :     case USI_FTYPE_V32QI:
   12402              :     case UDI_FTYPE_V64QI:
   12403              :     case V16QI_FTYPE_UHI:
   12404              :     case V32QI_FTYPE_USI:
   12405              :     case V64QI_FTYPE_UDI:
   12406              :     case V8HI_FTYPE_UQI:
   12407              :     case V16HI_FTYPE_UHI:
   12408              :     case V32HI_FTYPE_USI:
   12409              :     case V4SI_FTYPE_UQI:
   12410              :     case V8SI_FTYPE_UQI:
   12411              :     case V4SI_FTYPE_UHI:
   12412              :     case V8SI_FTYPE_UHI:
   12413              :     case UQI_FTYPE_V8HI:
   12414              :     case UHI_FTYPE_V16HI:
   12415              :     case USI_FTYPE_V32HI:
   12416              :     case UQI_FTYPE_V4SI:
   12417              :     case UQI_FTYPE_V8SI:
   12418              :     case UHI_FTYPE_V16SI:
   12419              :     case UQI_FTYPE_V2DI:
   12420              :     case UQI_FTYPE_V4DI:
   12421              :     case UQI_FTYPE_V8DI:
   12422              :     case V16SI_FTYPE_UHI:
   12423              :     case V2DI_FTYPE_UQI:
   12424              :     case V4DI_FTYPE_UQI:
   12425              :     case V16SI_FTYPE_INT:
   12426              :     case V16SF_FTYPE_V8SF:
   12427              :     case V16SI_FTYPE_V8SI:
   12428              :     case V16SF_FTYPE_V4SF:
   12429              :     case V16SI_FTYPE_V4SI:
   12430              :     case V16SI_FTYPE_V16SF:
   12431              :     case V16SI_FTYPE_V16SI:
   12432              :     case V64QI_FTYPE_V64QI:
   12433              :     case V32HI_FTYPE_V32HI:
   12434              :     case V16SF_FTYPE_V16SF:
   12435              :     case V8DI_FTYPE_UQI:
   12436              :     case V8DI_FTYPE_V8DI:
   12437              :     case V8DF_FTYPE_V4DF:
   12438              :     case V8DF_FTYPE_V2DF:
   12439              :     case V8DF_FTYPE_V8DF:
   12440              :     case V4DI_FTYPE_V4DI:
   12441              :     case V16BF_FTYPE_V16SF:
   12442              :     case V8BF_FTYPE_V8SF:
   12443              :     case V8BF_FTYPE_V4SF:
   12444              :       nargs = 1;
   12445              :       break;
   12446           52 :     case V4SF_FTYPE_V4SF_VEC_MERGE:
   12447           52 :     case V2DF_FTYPE_V2DF_VEC_MERGE:
   12448           52 :       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
   12449         9504 :     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
   12450         9504 :     case V16QI_FTYPE_V16QI_V16QI:
   12451         9504 :     case V16QI_FTYPE_V8HI_V8HI:
   12452         9504 :     case V16HF_FTYPE_V16HF_V16HF:
   12453         9504 :     case V16SF_FTYPE_V16SF_V16SF:
   12454         9504 :     case V16SI_FTYPE_V16SI_V16SI:
   12455         9504 :     case V8QI_FTYPE_V8QI_V8QI:
   12456         9504 :     case V8QI_FTYPE_V4HI_V4HI:
   12457         9504 :     case V8HI_FTYPE_V8HI_V8HI:
   12458         9504 :     case V8HI_FTYPE_V16QI_V16QI:
   12459         9504 :     case V8HI_FTYPE_V4SI_V4SI:
   12460         9504 :     case V8HF_FTYPE_V8HF_V8HF:
   12461         9504 :     case V8SF_FTYPE_V8SF_V8SF:
   12462         9504 :     case V8SF_FTYPE_V8SF_V8SI:
   12463         9504 :     case V8DF_FTYPE_V8DF_V8DF:
   12464         9504 :     case V4SI_FTYPE_V4SI_V4SI:
   12465         9504 :     case V4SI_FTYPE_V8HI_V8HI:
   12466         9504 :     case V4SI_FTYPE_V2DF_V2DF:
   12467         9504 :     case V4HI_FTYPE_V4HI_V4HI:
   12468         9504 :     case V4HI_FTYPE_V8QI_V8QI:
   12469         9504 :     case V4HI_FTYPE_V2SI_V2SI:
   12470         9504 :     case V4DF_FTYPE_V4DF_V4DF:
   12471         9504 :     case V4DF_FTYPE_V4DF_V4DI:
   12472         9504 :     case V4SF_FTYPE_V4SF_V4SF:
   12473         9504 :     case V4SF_FTYPE_V4SF_V4SI:
   12474         9504 :     case V4SF_FTYPE_V4SF_V2SI:
   12475         9504 :     case V4SF_FTYPE_V4SF_V2DF:
   12476         9504 :     case V4SF_FTYPE_V4SF_UINT:
   12477         9504 :     case V4SF_FTYPE_V4SF_DI:
   12478         9504 :     case V4SF_FTYPE_V4SF_SI:
   12479         9504 :     case V4DI_FTYPE_V4DI_V2DI:
   12480         9504 :     case V2DI_FTYPE_V2DI_V2DI:
   12481         9504 :     case V2DI_FTYPE_V16QI_V16QI:
   12482         9504 :     case V2DI_FTYPE_V4SI_V4SI:
   12483         9504 :     case V2DI_FTYPE_V2DI_V16QI:
   12484         9504 :     case V2SI_FTYPE_V2SI_V2SI:
   12485         9504 :     case V2SI_FTYPE_V4HI_V4HI:
   12486         9504 :     case V2SI_FTYPE_V2SF_V2SF:
   12487         9504 :     case V2DF_FTYPE_V2DF_V2DF:
   12488         9504 :     case V2DF_FTYPE_V2DF_V4SF:
   12489         9504 :     case V2DF_FTYPE_V2DF_V2DI:
   12490         9504 :     case V2DF_FTYPE_V2DF_DI:
   12491         9504 :     case V2DF_FTYPE_V2DF_SI:
   12492         9504 :     case V2DF_FTYPE_V2DF_UINT:
   12493         9504 :     case V2SF_FTYPE_V2SF_V2SF:
   12494         9504 :     case V1DI_FTYPE_V1DI_V1DI:
   12495         9504 :     case V1DI_FTYPE_V8QI_V8QI:
   12496         9504 :     case V1DI_FTYPE_V2SI_V2SI:
   12497         9504 :     case V32QI_FTYPE_V16HI_V16HI:
   12498         9504 :     case V16HI_FTYPE_V8SI_V8SI:
   12499         9504 :     case V64QI_FTYPE_V64QI_V64QI:
   12500         9504 :     case V32QI_FTYPE_V32QI_V32QI:
   12501         9504 :     case V32BF_FTYPE_V32BF_V32BF:
   12502         9504 :     case V16BF_FTYPE_V16BF_V16BF:
   12503         9504 :     case V8BF_FTYPE_V8BF_V8BF:
   12504         9504 :     case V16HI_FTYPE_V32QI_V32QI:
   12505         9504 :     case V16HI_FTYPE_V16HI_V16HI:
   12506         9504 :     case V8SI_FTYPE_V4DF_V4DF:
   12507         9504 :     case V8SI_FTYPE_V8SI_V8SI:
   12508         9504 :     case V8SI_FTYPE_V16HI_V16HI:
   12509         9504 :     case V4DI_FTYPE_V4DI_V4DI:
   12510         9504 :     case V4DI_FTYPE_V8SI_V8SI:
   12511         9504 :     case V4DI_FTYPE_V32QI_V32QI:
   12512         9504 :     case V8DI_FTYPE_V64QI_V64QI:
   12513         9504 :       if (comparison == UNKNOWN)
   12514         8970 :         return ix86_expand_binop_builtin (icode, exp, target);
   12515              :       nargs = 2;
   12516              :       break;
   12517           80 :     case V4SF_FTYPE_V4SF_V4SF_SWAP:
   12518           80 :     case V2DF_FTYPE_V2DF_V2DF_SWAP:
   12519           80 :       gcc_assert (comparison != UNKNOWN);
   12520              :       nargs = 2;
   12521              :       swap = true;
   12522              :       break;
   12523         1481 :     case V16HI_FTYPE_V16HI_V8HI_COUNT:
   12524         1481 :     case V16HI_FTYPE_V16HI_SI_COUNT:
   12525         1481 :     case V8SI_FTYPE_V8SI_V4SI_COUNT:
   12526         1481 :     case V8SI_FTYPE_V8SI_SI_COUNT:
   12527         1481 :     case V4DI_FTYPE_V4DI_V2DI_COUNT:
   12528         1481 :     case V4DI_FTYPE_V4DI_INT_COUNT:
   12529         1481 :     case V8HI_FTYPE_V8HI_V8HI_COUNT:
   12530         1481 :     case V8HI_FTYPE_V8HI_SI_COUNT:
   12531         1481 :     case V4SI_FTYPE_V4SI_V4SI_COUNT:
   12532         1481 :     case V4SI_FTYPE_V4SI_SI_COUNT:
   12533         1481 :     case V4HI_FTYPE_V4HI_V4HI_COUNT:
   12534         1481 :     case V4HI_FTYPE_V4HI_SI_COUNT:
   12535         1481 :     case V2DI_FTYPE_V2DI_V2DI_COUNT:
   12536         1481 :     case V2DI_FTYPE_V2DI_SI_COUNT:
   12537         1481 :     case V2SI_FTYPE_V2SI_V2SI_COUNT:
   12538         1481 :     case V2SI_FTYPE_V2SI_SI_COUNT:
   12539         1481 :     case V1DI_FTYPE_V1DI_V1DI_COUNT:
   12540         1481 :     case V1DI_FTYPE_V1DI_SI_COUNT:
   12541         1481 :       nargs = 2;
   12542         1481 :       second_arg_count = true;
   12543         1481 :       break;
   12544         1408 :     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
   12545         1408 :     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
   12546         1408 :     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
   12547         1408 :     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
   12548         1408 :     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
   12549         1408 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
   12550         1408 :     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
   12551         1408 :     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
   12552         1408 :     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
   12553         1408 :     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
   12554         1408 :     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
   12555         1408 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
   12556         1408 :     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
   12557         1408 :     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
   12558         1408 :     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
   12559         1408 :     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
   12560         1408 :     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
   12561         1408 :     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
   12562         1408 :       nargs = 4;
   12563         1408 :       second_arg_count = true;
   12564         1408 :       break;
   12565          966 :     case UINT64_FTYPE_UINT64_UINT64:
   12566          966 :     case UINT_FTYPE_UINT_UINT:
   12567          966 :     case UINT_FTYPE_UINT_USHORT:
   12568          966 :     case UINT_FTYPE_UINT_UCHAR:
   12569          966 :     case UINT16_FTYPE_UINT16_INT:
   12570          966 :     case UINT8_FTYPE_UINT8_INT:
   12571          966 :     case UQI_FTYPE_UQI_UQI:
   12572          966 :     case UHI_FTYPE_UHI_UHI:
   12573          966 :     case USI_FTYPE_USI_USI:
   12574          966 :     case UDI_FTYPE_UDI_UDI:
   12575          966 :     case V16SI_FTYPE_V8DF_V8DF:
   12576          966 :     case V32BF_FTYPE_V16SF_V16SF:
   12577          966 :     case V16BF_FTYPE_V8SF_V8SF:
   12578          966 :     case V8BF_FTYPE_V4SF_V4SF:
   12579          966 :     case V16BF_FTYPE_V16SF_UHI:
   12580          966 :     case V8BF_FTYPE_V8SF_UQI:
   12581          966 :     case V8BF_FTYPE_V4SF_UQI:
   12582          966 :     case V16QI_FTYPE_V16QI_V8HF:
   12583          966 :       nargs = 2;
   12584          966 :       break;
   12585          649 :     case V2DI_FTYPE_V2DI_INT_CONVERT:
   12586          649 :       nargs = 2;
   12587          649 :       rmode = V1TImode;
   12588          649 :       nargs_constant = 1;
   12589          649 :       break;
   12590           42 :     case V4DI_FTYPE_V4DI_INT_CONVERT:
   12591           42 :       nargs = 2;
   12592           42 :       rmode = V2TImode;
   12593           42 :       nargs_constant = 1;
   12594           42 :       break;
   12595           16 :     case V8DI_FTYPE_V8DI_INT_CONVERT:
   12596           16 :       nargs = 2;
   12597           16 :       rmode = V4TImode;
   12598           16 :       nargs_constant = 1;
   12599           16 :       break;
   12600         2380 :     case V8HI_FTYPE_V8HI_INT:
   12601         2380 :     case V8HI_FTYPE_V8SF_INT:
   12602         2380 :     case V16HI_FTYPE_V16SF_INT:
   12603         2380 :     case V8HI_FTYPE_V4SF_INT:
   12604         2380 :     case V8SF_FTYPE_V8SF_INT:
   12605         2380 :     case V4SF_FTYPE_V16SF_INT:
   12606         2380 :     case V16SF_FTYPE_V16SF_INT:
   12607         2380 :     case V4SI_FTYPE_V4SI_INT:
   12608         2380 :     case V4SI_FTYPE_V8SI_INT:
   12609         2380 :     case V4HI_FTYPE_V4HI_INT:
   12610         2380 :     case V4DF_FTYPE_V4DF_INT:
   12611         2380 :     case V4DF_FTYPE_V8DF_INT:
   12612         2380 :     case V4SF_FTYPE_V4SF_INT:
   12613         2380 :     case V4SF_FTYPE_V8SF_INT:
   12614         2380 :     case V2DI_FTYPE_V2DI_INT:
   12615         2380 :     case V2DF_FTYPE_V2DF_INT:
   12616         2380 :     case V2DF_FTYPE_V4DF_INT:
   12617         2380 :     case V16HI_FTYPE_V16HI_INT:
   12618         2380 :     case V8SI_FTYPE_V8SI_INT:
   12619         2380 :     case V16SI_FTYPE_V16SI_INT:
   12620         2380 :     case V4SI_FTYPE_V16SI_INT:
   12621         2380 :     case V4DI_FTYPE_V4DI_INT:
   12622         2380 :     case V2DI_FTYPE_V4DI_INT:
   12623         2380 :     case V4DI_FTYPE_V8DI_INT:
   12624         2380 :     case UQI_FTYPE_UQI_UQI_CONST:
   12625         2380 :     case UHI_FTYPE_UHI_UQI:
   12626         2380 :     case USI_FTYPE_USI_UQI:
   12627         2380 :     case UDI_FTYPE_UDI_UQI:
   12628         2380 :       nargs = 2;
   12629         2380 :       nargs_constant = 1;
   12630         2380 :       break;
   12631        18709 :     case V16QI_FTYPE_V16QI_V16QI_V16QI:
   12632        18709 :     case V8SF_FTYPE_V8SF_V8SF_V8SF:
   12633        18709 :     case V4DF_FTYPE_V4DF_V4DF_V4DF:
   12634        18709 :     case V4SF_FTYPE_V4SF_V4SF_V4SF:
   12635        18709 :     case V2DF_FTYPE_V2DF_V2DF_V2DF:
   12636        18709 :     case V32QI_FTYPE_V32QI_V32QI_V32QI:
   12637        18709 :     case UHI_FTYPE_V16SI_V16SI_UHI:
   12638        18709 :     case UQI_FTYPE_V8DI_V8DI_UQI:
   12639        18709 :     case V16HI_FTYPE_V16SI_V16HI_UHI:
   12640        18709 :     case V16QI_FTYPE_V16SI_V16QI_UHI:
   12641        18709 :     case V16QI_FTYPE_V8DI_V16QI_UQI:
   12642        18709 :     case V32HF_FTYPE_V32HF_V32HF_USI:
   12643        18709 :     case V16SF_FTYPE_V16SF_V16SF_UHI:
   12644        18709 :     case V16SF_FTYPE_V4SF_V16SF_UHI:
   12645        18709 :     case V16SI_FTYPE_SI_V16SI_UHI:
   12646        18709 :     case V16SI_FTYPE_V16HI_V16SI_UHI:
   12647        18709 :     case V16SI_FTYPE_V16QI_V16SI_UHI:
   12648        18709 :     case V8SF_FTYPE_V4SF_V8SF_UQI:
   12649        18709 :     case V4DF_FTYPE_V2DF_V4DF_UQI:
   12650        18709 :     case V8SI_FTYPE_V4SI_V8SI_UQI:
   12651        18709 :     case V8SI_FTYPE_SI_V8SI_UQI:
   12652        18709 :     case V4SI_FTYPE_V4SI_V4SI_UQI:
   12653        18709 :     case V4SI_FTYPE_SI_V4SI_UQI:
   12654        18709 :     case V4DI_FTYPE_V2DI_V4DI_UQI:
   12655        18709 :     case V4DI_FTYPE_DI_V4DI_UQI:
   12656        18709 :     case V2DI_FTYPE_V2DI_V2DI_UQI:
   12657        18709 :     case V2DI_FTYPE_DI_V2DI_UQI:
   12658        18709 :     case V64QI_FTYPE_V64QI_V64QI_UDI:
   12659        18709 :     case V64QI_FTYPE_V16QI_V64QI_UDI:
   12660        18709 :     case V64QI_FTYPE_QI_V64QI_UDI:
   12661        18709 :     case V32QI_FTYPE_V32QI_V32QI_USI:
   12662        18709 :     case V32QI_FTYPE_V16QI_V32QI_USI:
   12663        18709 :     case V32QI_FTYPE_QI_V32QI_USI:
   12664        18709 :     case V16QI_FTYPE_V16QI_V16QI_UHI:
   12665        18709 :     case V16QI_FTYPE_QI_V16QI_UHI:
   12666        18709 :     case V32HI_FTYPE_V8HI_V32HI_USI:
   12667        18709 :     case V32HI_FTYPE_V32BF_V32HI_USI:
   12668        18709 :     case V32HI_FTYPE_HI_V32HI_USI:
   12669        18709 :     case V16HI_FTYPE_V8HI_V16HI_UHI:
   12670        18709 :     case V16HI_FTYPE_V16BF_V16HI_UHI:
   12671        18709 :     case V16HI_FTYPE_HI_V16HI_UHI:
   12672        18709 :     case V8HI_FTYPE_V8HI_V8HI_UQI:
   12673        18709 :     case V8HI_FTYPE_V8BF_V8HI_UQI:
   12674        18709 :     case V8BF_FTYPE_V8BF_V8BF_UQI:
   12675        18709 :     case V8HI_FTYPE_HI_V8HI_UQI:
   12676        18709 :     case V16HF_FTYPE_V16HF_V16HF_UHI:
   12677        18709 :     case V8SF_FTYPE_V8HI_V8SF_UQI:
   12678        18709 :     case V4SF_FTYPE_V8HI_V4SF_UQI:
   12679        18709 :     case V8SI_FTYPE_V8HF_V8SI_UQI:
   12680        18709 :     case V8SF_FTYPE_V8HF_V8SF_UQI:
   12681        18709 :     case V8SI_FTYPE_V8SF_V8SI_UQI:
   12682        18709 :     case V4SI_FTYPE_V4SF_V4SI_UQI:
   12683        18709 :     case V4SI_FTYPE_V8HF_V4SI_UQI:
   12684        18709 :     case V4SF_FTYPE_V8HF_V4SF_UQI:
   12685        18709 :     case V4DI_FTYPE_V8HF_V4DI_UQI:
   12686        18709 :     case V4DI_FTYPE_V4SF_V4DI_UQI:
   12687        18709 :     case V2DI_FTYPE_V8HF_V2DI_UQI:
   12688        18709 :     case V2DI_FTYPE_V4SF_V2DI_UQI:
   12689        18709 :     case V8HF_FTYPE_V8HF_V8HF_UQI:
   12690        18709 :     case V8HF_FTYPE_V8HF_V8HF_V8HF:
   12691        18709 :     case V8HF_FTYPE_V8HI_V8HF_UQI:
   12692        18709 :     case V8HF_FTYPE_V8SI_V8HF_UQI:
   12693        18709 :     case V8HF_FTYPE_V8SF_V8HF_UQI:
   12694        18709 :     case V8HF_FTYPE_V4SI_V8HF_UQI:
   12695        18709 :     case V8HF_FTYPE_V4SF_V8HF_UQI:
   12696        18709 :     case V8HF_FTYPE_V4DI_V8HF_UQI:
   12697        18709 :     case V8HF_FTYPE_V4DF_V8HF_UQI:
   12698        18709 :     case V8HF_FTYPE_V2DI_V8HF_UQI:
   12699        18709 :     case V8HF_FTYPE_V2DF_V8HF_UQI:
   12700        18709 :     case V4SF_FTYPE_V4DI_V4SF_UQI:
   12701        18709 :     case V4SF_FTYPE_V2DI_V4SF_UQI:
   12702        18709 :     case V4DF_FTYPE_V4DI_V4DF_UQI:
   12703        18709 :     case V4DF_FTYPE_V8HF_V4DF_UQI:
   12704        18709 :     case V2DF_FTYPE_V8HF_V2DF_UQI:
   12705        18709 :     case V2DF_FTYPE_V2DI_V2DF_UQI:
   12706        18709 :     case V16QI_FTYPE_V8HI_V16QI_UQI:
   12707        18709 :     case V16QI_FTYPE_V16HI_V16QI_UHI:
   12708        18709 :     case V16QI_FTYPE_V4SI_V16QI_UQI:
   12709        18709 :     case V16QI_FTYPE_V8SI_V16QI_UQI:
   12710        18709 :     case V8HI_FTYPE_V8HF_V8HI_UQI:
   12711        18709 :     case V8HI_FTYPE_V4SI_V8HI_UQI:
   12712        18709 :     case V8HI_FTYPE_V8SI_V8HI_UQI:
   12713        18709 :     case V16QI_FTYPE_V2DI_V16QI_UQI:
   12714        18709 :     case V16QI_FTYPE_V4DI_V16QI_UQI:
   12715        18709 :     case V8HI_FTYPE_V2DI_V8HI_UQI:
   12716        18709 :     case V8HI_FTYPE_V4DI_V8HI_UQI:
   12717        18709 :     case V4SI_FTYPE_V2DI_V4SI_UQI:
   12718        18709 :     case V4SI_FTYPE_V4DI_V4SI_UQI:
   12719        18709 :     case V32QI_FTYPE_V32HI_V32QI_USI:
   12720        18709 :     case UHI_FTYPE_V16QI_V16QI_UHI:
   12721        18709 :     case USI_FTYPE_V32QI_V32QI_USI:
   12722        18709 :     case UDI_FTYPE_V64QI_V64QI_UDI:
   12723        18709 :     case UQI_FTYPE_V8HI_V8HI_UQI:
   12724        18709 :     case UHI_FTYPE_V16HI_V16HI_UHI:
   12725        18709 :     case USI_FTYPE_V32HI_V32HI_USI:
   12726        18709 :     case UQI_FTYPE_V4SI_V4SI_UQI:
   12727        18709 :     case UQI_FTYPE_V8SI_V8SI_UQI:
   12728        18709 :     case UQI_FTYPE_V2DI_V2DI_UQI:
   12729        18709 :     case UQI_FTYPE_V4DI_V4DI_UQI:
   12730        18709 :     case V4SF_FTYPE_V2DF_V4SF_UQI:
   12731        18709 :     case V4SF_FTYPE_V4DF_V4SF_UQI:
   12732        18709 :     case V16SI_FTYPE_V16SI_V16SI_UHI:
   12733        18709 :     case V16SI_FTYPE_V4SI_V16SI_UHI:
   12734        18709 :     case V2DI_FTYPE_V4SI_V2DI_UQI:
   12735        18709 :     case V2DI_FTYPE_V8HI_V2DI_UQI:
   12736        18709 :     case V2DI_FTYPE_V16QI_V2DI_UQI:
   12737        18709 :     case V4DI_FTYPE_V4DI_V4DI_UQI:
   12738        18709 :     case V4DI_FTYPE_V4SI_V4DI_UQI:
   12739        18709 :     case V4DI_FTYPE_V8HI_V4DI_UQI:
   12740        18709 :     case V4DI_FTYPE_V16QI_V4DI_UQI:
   12741        18709 :     case V4DI_FTYPE_V4DF_V4DI_UQI:
   12742        18709 :     case V2DI_FTYPE_V2DF_V2DI_UQI:
   12743        18709 :     case V4SI_FTYPE_V4DF_V4SI_UQI:
   12744        18709 :     case V4SI_FTYPE_V2DF_V4SI_UQI:
   12745        18709 :     case V4SI_FTYPE_V8HI_V4SI_UQI:
   12746        18709 :     case V4SI_FTYPE_V16QI_V4SI_UQI:
   12747        18709 :     case V4DI_FTYPE_V4DI_V4DI_V4DI:
   12748        18709 :     case V8DF_FTYPE_V2DF_V8DF_UQI:
   12749        18709 :     case V8DF_FTYPE_V4DF_V8DF_UQI:
   12750        18709 :     case V8DF_FTYPE_V8DF_V8DF_UQI:
   12751        18709 :     case V8SF_FTYPE_V8SF_V8SF_UQI:
   12752        18709 :     case V8SF_FTYPE_V8SI_V8SF_UQI:
   12753        18709 :     case V4DF_FTYPE_V4DF_V4DF_UQI:
   12754        18709 :     case V4SF_FTYPE_V4SF_V4SF_UQI:
   12755        18709 :     case V2DF_FTYPE_V2DF_V2DF_UQI:
   12756        18709 :     case V2DF_FTYPE_V4SF_V2DF_UQI:
   12757        18709 :     case V2DF_FTYPE_V4SI_V2DF_UQI:
   12758        18709 :     case V4SF_FTYPE_V4SI_V4SF_UQI:
   12759        18709 :     case V4DF_FTYPE_V4SF_V4DF_UQI:
   12760        18709 :     case V4DF_FTYPE_V4SI_V4DF_UQI:
   12761        18709 :     case V8SI_FTYPE_V8SI_V8SI_UQI:
   12762        18709 :     case V8SI_FTYPE_V8HI_V8SI_UQI:
   12763        18709 :     case V8SI_FTYPE_V16QI_V8SI_UQI:
   12764        18709 :     case V8DF_FTYPE_V8SI_V8DF_UQI:
   12765        18709 :     case V8DI_FTYPE_DI_V8DI_UQI:
   12766        18709 :     case V16SF_FTYPE_V8SF_V16SF_UHI:
   12767        18709 :     case V16SI_FTYPE_V8SI_V16SI_UHI:
   12768        18709 :     case V16HF_FTYPE_V16HI_V16HF_UHI:
   12769        18709 :     case V16HF_FTYPE_V16HF_V16HF_V16HF:
   12770        18709 :     case V16HI_FTYPE_V16HF_V16HI_UHI:
   12771        18709 :     case V16HI_FTYPE_V16HI_V16HI_UHI:
   12772        18709 :     case V16BF_FTYPE_V16BF_V16BF_UHI:
   12773        18709 :     case V8HI_FTYPE_V16QI_V8HI_UQI:
   12774        18709 :     case V16HI_FTYPE_V16QI_V16HI_UHI:
   12775        18709 :     case V32HI_FTYPE_V32HI_V32HI_USI:
   12776        18709 :     case V32BF_FTYPE_V32BF_V32BF_USI:
   12777        18709 :     case V32HI_FTYPE_V32QI_V32HI_USI:
   12778        18709 :     case V8DI_FTYPE_V16QI_V8DI_UQI:
   12779        18709 :     case V8DI_FTYPE_V2DI_V8DI_UQI:
   12780        18709 :     case V8DI_FTYPE_V4DI_V8DI_UQI:
   12781        18709 :     case V8DI_FTYPE_V8DI_V8DI_UQI:
   12782        18709 :     case V8DI_FTYPE_V8HI_V8DI_UQI:
   12783        18709 :     case V8DI_FTYPE_V8SI_V8DI_UQI:
   12784        18709 :     case V8HI_FTYPE_V8DI_V8HI_UQI:
   12785        18709 :     case V8SI_FTYPE_V8DI_V8SI_UQI:
   12786        18709 :     case V4SI_FTYPE_V4SI_V4SI_V4SI:
   12787        18709 :     case V4DI_FTYPE_V4DI_V4DI_V2DI:
   12788        18709 :     case V16SI_FTYPE_V16SI_V16SI_V16SI:
   12789        18709 :     case V8DI_FTYPE_V8DI_V8DI_V8DI:
   12790        18709 :     case V32HI_FTYPE_V32HI_V32HI_V32HI:
   12791        18709 :     case V2DI_FTYPE_V2DI_V2DI_V2DI:
   12792        18709 :     case V16HI_FTYPE_V16HI_V16HI_V16HI:
   12793        18709 :     case V8SI_FTYPE_V8SI_V8SI_V8SI:
   12794        18709 :     case V8HI_FTYPE_V8HI_V8HI_V8HI:
   12795        18709 :     case V32BF_FTYPE_V16SF_V16SF_USI:
   12796        18709 :     case V16BF_FTYPE_V8SF_V8SF_UHI:
   12797        18709 :     case V8BF_FTYPE_V4SF_V4SF_UQI:
   12798        18709 :     case V16BF_FTYPE_V16SF_V16BF_UHI:
   12799        18709 :     case V8BF_FTYPE_V8SF_V8BF_UQI:
   12800        18709 :     case V8BF_FTYPE_V4SF_V8BF_UQI:
   12801        18709 :     case V16SF_FTYPE_V16SF_V32BF_V32BF:
   12802        18709 :     case V8SF_FTYPE_V8SF_V16BF_V16BF:
   12803        18709 :     case V4SF_FTYPE_V4SF_V8BF_V8BF:
   12804        18709 :     case V16QI_FTYPE_V16QI_V8HF_V8HF:
   12805        18709 :     case V32QI_FTYPE_V32QI_V16HF_V16HF:
   12806        18709 :     case V64QI_FTYPE_V64QI_V32HF_V32HF:
   12807        18709 :     case V16QI_FTYPE_V8HF_V16QI_UQI:
   12808        18709 :     case V16QI_FTYPE_V16HF_V16QI_UHI:
   12809        18709 :     case V32QI_FTYPE_V32HF_V32QI_USI:
   12810        18709 :     case V8HF_FTYPE_V16QI_V8HF_UQI:
   12811        18709 :     case V16HF_FTYPE_V16QI_V16HF_UHI:
   12812        18709 :     case V32HF_FTYPE_V32QI_V32HF_USI:
   12813        18709 :     case V16SI_FTYPE_V16SF_V16SI_UHI:
   12814        18709 :     case V32HI_FTYPE_V32HF_V32HI_USI:
   12815        18709 :     case V8DI_FTYPE_V8SF_V8DI_UQI:
   12816        18709 :     case V8DI_FTYPE_V8DF_V8DI_UQI:
   12817        18709 :     case V8SI_FTYPE_V8DF_V8SI_UQI:
   12818        18709 :       nargs = 3;
   12819        18709 :       break;
   12820         1479 :     case V32QI_FTYPE_V32QI_V32QI_INT:
   12821         1479 :     case V16HI_FTYPE_V16HI_V16HI_INT:
   12822         1479 :     case V16QI_FTYPE_V16QI_V16QI_INT:
   12823         1479 :     case V4DI_FTYPE_V4DI_V4DI_INT:
   12824         1479 :     case V8HI_FTYPE_V8HI_V8HI_INT:
   12825         1479 :     case V8SI_FTYPE_V8SI_V8SI_INT:
   12826         1479 :     case V8SI_FTYPE_V8SI_V4SI_INT:
   12827         1479 :     case V8SF_FTYPE_V8SF_V8SF_INT:
   12828         1479 :     case V8SF_FTYPE_V8SF_V4SF_INT:
   12829         1479 :     case V4SI_FTYPE_V4SI_V4SI_INT:
   12830         1479 :     case V4DF_FTYPE_V4DF_V4DF_INT:
   12831         1479 :     case V16SF_FTYPE_V16SF_V16SF_INT:
   12832         1479 :     case V16SF_FTYPE_V16SF_V4SF_INT:
   12833         1479 :     case V16SI_FTYPE_V16SI_V4SI_INT:
   12834         1479 :     case V4DF_FTYPE_V4DF_V2DF_INT:
   12835         1479 :     case V4SF_FTYPE_V4SF_V4SF_INT:
   12836         1479 :     case V2DI_FTYPE_V2DI_V2DI_INT:
   12837         1479 :     case V4DI_FTYPE_V4DI_V2DI_INT:
   12838         1479 :     case V2DF_FTYPE_V2DF_V2DF_INT:
   12839         1479 :     case UQI_FTYPE_V8DI_V8UDI_INT:
   12840         1479 :     case UQI_FTYPE_V8DF_V8DF_INT:
   12841         1479 :     case UQI_FTYPE_V2DF_V2DF_INT:
   12842         1479 :     case UQI_FTYPE_V4SF_V4SF_INT:
   12843         1479 :     case UHI_FTYPE_V16SI_V16SI_INT:
   12844         1479 :     case UHI_FTYPE_V16SF_V16SF_INT:
   12845         1479 :     case V64QI_FTYPE_V64QI_V64QI_INT:
   12846         1479 :     case V32HI_FTYPE_V32HI_V32HI_INT:
   12847         1479 :     case V16SI_FTYPE_V16SI_V16SI_INT:
   12848         1479 :     case V8DI_FTYPE_V8DI_V8DI_INT:
   12849         1479 :       nargs = 3;
   12850         1479 :       nargs_constant = 1;
   12851         1479 :       break;
   12852           47 :     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
   12853           47 :       nargs = 3;
   12854           47 :       rmode = V4DImode;
   12855           47 :       nargs_constant = 1;
   12856           47 :       break;
   12857           80 :     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
   12858           80 :       nargs = 3;
   12859           80 :       rmode = V2DImode;
   12860           80 :       nargs_constant = 1;
   12861           80 :       break;
   12862           48 :     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
   12863           48 :       nargs = 3;
   12864           48 :       rmode = DImode;
   12865           48 :       nargs_constant = 1;
   12866           48 :       break;
   12867           20 :     case V2DI_FTYPE_V2DI_UINT_UINT:
   12868           20 :       nargs = 3;
   12869           20 :       nargs_constant = 2;
   12870           20 :       break;
   12871            8 :     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
   12872            8 :       nargs = 3;
   12873            8 :       rmode = V8DImode;
   12874            8 :       nargs_constant = 1;
   12875            8 :       break;
   12876           16 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
   12877           16 :       nargs = 5;
   12878           16 :       rmode = V8DImode;
   12879           16 :       mask_pos = 2;
   12880           16 :       nargs_constant = 1;
   12881           16 :       break;
   12882          320 :     case QI_FTYPE_V8DF_INT_UQI:
   12883          320 :     case QI_FTYPE_V4DF_INT_UQI:
   12884          320 :     case QI_FTYPE_V2DF_INT_UQI:
   12885          320 :     case HI_FTYPE_V16SF_INT_UHI:
   12886          320 :     case QI_FTYPE_V8SF_INT_UQI:
   12887          320 :     case QI_FTYPE_V4SF_INT_UQI:
   12888          320 :     case QI_FTYPE_V8HF_INT_UQI:
   12889          320 :     case HI_FTYPE_V16HF_INT_UHI:
   12890          320 :     case SI_FTYPE_V32HF_INT_USI:
   12891          320 :     case QI_FTYPE_V8BF_INT_UQI:
   12892          320 :     case HI_FTYPE_V16BF_INT_UHI:
   12893          320 :     case SI_FTYPE_V32BF_INT_USI:
   12894          320 :     case V4SI_FTYPE_V4SI_V4SI_UHI:
   12895          320 :     case V8SI_FTYPE_V8SI_V8SI_UHI:
   12896          320 :       nargs = 3;
   12897          320 :       mask_pos = 1;
   12898          320 :       nargs_constant = 1;
   12899          320 :       break;
   12900           17 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
   12901           17 :       nargs = 5;
   12902           17 :       rmode = V4DImode;
   12903           17 :       mask_pos = 2;
   12904           17 :       nargs_constant = 1;
   12905           17 :       break;
   12906           17 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
   12907           17 :       nargs = 5;
   12908           17 :       rmode = V2DImode;
   12909           17 :       mask_pos = 2;
   12910           17 :       nargs_constant = 1;
   12911           17 :       break;
   12912        17242 :     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
   12913        17242 :     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
   12914        17242 :     case V32BF_FTYPE_V32BF_V32BF_V32BF_USI:
   12915        17242 :     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
   12916        17242 :     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
   12917        17242 :     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
   12918        17242 :     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
   12919        17242 :     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
   12920        17242 :     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
   12921        17242 :     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
   12922        17242 :     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
   12923        17242 :     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
   12924        17242 :     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
   12925        17242 :     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
   12926        17242 :     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
   12927        17242 :     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
   12928        17242 :     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
   12929        17242 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
   12930        17242 :     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
   12931        17242 :     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
   12932        17242 :     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
   12933        17242 :     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
   12934        17242 :     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
   12935        17242 :     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
   12936        17242 :     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
   12937        17242 :     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
   12938        17242 :     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
   12939        17242 :     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
   12940        17242 :     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
   12941        17242 :     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
   12942        17242 :     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
   12943        17242 :     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
   12944        17242 :     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
   12945        17242 :     case V8BF_FTYPE_V8BF_V8BF_V8BF_UQI:
   12946        17242 :     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
   12947        17242 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
   12948        17242 :     case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
   12949        17242 :     case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
   12950        17242 :     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
   12951        17242 :     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
   12952        17242 :     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
   12953        17242 :     case V16BF_FTYPE_V16BF_V16BF_V16BF_UHI:
   12954        17242 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
   12955        17242 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
   12956        17242 :     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
   12957        17242 :     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
   12958        17242 :     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
   12959        17242 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
   12960        17242 :     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
   12961        17242 :     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
   12962        17242 :     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
   12963        17242 :     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
   12964        17242 :     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
   12965        17242 :     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
   12966        17242 :     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
   12967        17242 :     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
   12968        17242 :     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
   12969        17242 :     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
   12970        17242 :     case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
   12971        17242 :     case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
   12972        17242 :     case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
   12973        17242 :     case V32HF_FTYPE_V16SF_V16SF_V32HF_USI:
   12974        17242 :     case V16HF_FTYPE_V8SF_V8SF_V16HF_UHI:
   12975        17242 :     case V8HF_FTYPE_V4SF_V4SF_V8HF_UQI:
   12976        17242 :     case V16QI_FTYPE_V8HF_V8HF_V16QI_UHI:
   12977        17242 :     case V32QI_FTYPE_V16HF_V16HF_V32QI_USI:
   12978        17242 :     case V64QI_FTYPE_V32HF_V32HF_V64QI_UDI:
   12979        17242 :     case V16QI_FTYPE_V16QI_V8HF_V16QI_UHI:
   12980        17242 :     case V16QI_FTYPE_V32QI_V16HF_V16QI_UHI:
   12981        17242 :     case V32QI_FTYPE_V64QI_V32HF_V32QI_USI:
   12982        17242 :       nargs = 4;
   12983        17242 :       break;
   12984           11 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
   12985           11 :     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
   12986           11 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
   12987           11 :     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
   12988           11 :     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
   12989           11 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
   12990           11 :       nargs = 4;
   12991           11 :       nargs_constant = 1;
   12992           11 :       break;
   12993         3718 :     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
   12994         3718 :     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
   12995         3718 :     case QI_FTYPE_V4DF_V4DF_INT_UQI:
   12996         3718 :     case QI_FTYPE_V8SF_V8SF_INT_UQI:
   12997         3718 :     case UHI_FTYPE_V16HF_V16HF_INT_UHI:
   12998         3718 :     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
   12999         3718 :     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
   13000         3718 :     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
   13001         3718 :     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
   13002         3718 :     case UQI_FTYPE_V8HF_V8HF_INT_UQI:
   13003         3718 :     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
   13004         3718 :     case USI_FTYPE_V32QI_V32QI_INT_USI:
   13005         3718 :     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
   13006         3718 :     case USI_FTYPE_V32HI_V32HI_INT_USI:
   13007         3718 :     case USI_FTYPE_V32BF_V32BF_INT_USI:
   13008         3718 :     case USI_FTYPE_V32HF_V32HF_INT_USI:
   13009         3718 :     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
   13010         3718 :     case UHI_FTYPE_V16BF_V16BF_INT_UHI:
   13011         3718 :     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
   13012         3718 :     case UQI_FTYPE_V8BF_V8BF_INT_UQI:
   13013         3718 :       nargs = 4;
   13014         3718 :       mask_pos = 1;
   13015         3718 :       nargs_constant = 1;
   13016         3718 :       break;
   13017           23 :     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
   13018           23 :       nargs = 4;
   13019           23 :       nargs_constant = 2;
   13020           23 :       break;
   13021           67 :     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
   13022           67 :     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
   13023           67 :     case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
   13024           67 :     case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
   13025           67 :     case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
   13026           67 :       nargs = 4;
   13027           67 :       break;
   13028          679 :     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
   13029          679 :     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
   13030          679 :       mask_pos = 1;
   13031          679 :       nargs = 4;
   13032          679 :       nargs_constant = 1;
   13033          679 :       break;
   13034         3948 :     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
   13035         3948 :     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
   13036         3948 :     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
   13037         3948 :     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
   13038         3948 :     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
   13039         3948 :     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
   13040         3948 :     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
   13041         3948 :     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
   13042         3948 :     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
   13043         3948 :     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
   13044         3948 :     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
   13045         3948 :     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
   13046         3948 :     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
   13047         3948 :     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
   13048         3948 :     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
   13049         3948 :     case V32BF_FTYPE_V32BF_INT_V32BF_USI:
   13050         3948 :     case V16BF_FTYPE_V16BF_INT_V16BF_UHI:
   13051         3948 :     case V8BF_FTYPE_V8BF_INT_V8BF_UQI:
   13052         3948 :     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
   13053         3948 :     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
   13054         3948 :     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
   13055         3948 :     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
   13056         3948 :     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
   13057         3948 :     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
   13058         3948 :     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
   13059         3948 :     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
   13060         3948 :     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
   13061         3948 :     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
   13062         3948 :     case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
   13063         3948 :     case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
   13064         3948 :     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
   13065         3948 :     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
   13066         3948 :     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
   13067         3948 :     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
   13068         3948 :     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
   13069         3948 :       nargs = 4;
   13070         3948 :       mask_pos = 2;
   13071         3948 :       nargs_constant = 1;
   13072         3948 :       break;
   13073         1726 :     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
   13074         1726 :     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
   13075         1726 :     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
   13076         1726 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
   13077         1726 :     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
   13078         1726 :     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
   13079         1726 :     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
   13080         1726 :     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
   13081         1726 :     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
   13082         1726 :     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
   13083         1726 :     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
   13084         1726 :     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
   13085         1726 :     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
   13086         1726 :     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
   13087         1726 :     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
   13088         1726 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
   13089         1726 :     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
   13090         1726 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
   13091         1726 :     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
   13092         1726 :     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
   13093         1726 :     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
   13094         1726 :     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
   13095         1726 :     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
   13096         1726 :     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
   13097         1726 :     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
   13098         1726 :     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
   13099         1726 :     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
   13100         1726 :       nargs = 5;
   13101         1726 :       mask_pos = 2;
   13102         1726 :       nargs_constant = 1;
   13103         1726 :       break;
   13104          268 :     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
   13105          268 :     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
   13106          268 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
   13107          268 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
   13108          268 :     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
   13109          268 :     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
   13110          268 :     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
   13111          268 :     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
   13112          268 :     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
   13113          268 :     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
   13114          268 :       nargs = 5;
   13115          268 :       mask_pos = 1;
   13116          268 :       nargs_constant = 1;
   13117          268 :       break;
   13118          732 :     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
   13119          732 :     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
   13120          732 :     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
   13121          732 :     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
   13122          732 :     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
   13123          732 :     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
   13124          732 :     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
   13125          732 :     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
   13126          732 :     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
   13127          732 :     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
   13128          732 :     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
   13129          732 :     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
   13130          732 :     case V8BF_FTYPE_V8BF_V8BF_INT_V8BF_UQI:
   13131          732 :     case V16BF_FTYPE_V16BF_V16BF_INT_V16BF_UHI:
   13132          732 :     case V32BF_FTYPE_V32BF_V32BF_INT_V32BF_USI:
   13133          732 :     case V16HF_FTYPE_V16HF_V16HF_INT_V16HF_UHI:
   13134          732 :     case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI:
   13135          732 :       nargs = 5;
   13136          732 :       mask_pos = 1;
   13137          732 :       nargs_constant = 2;
   13138          732 :       break;
   13139              : 
   13140            0 :     default:
   13141            0 :       gcc_unreachable ();
   13142              :     }
   13143              : 
   13144        56117 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   13145              : 
   13146        60124 :   if (comparison != UNKNOWN)
   13147              :     {
   13148          614 :       gcc_assert (nargs == 2);
   13149          614 :       return ix86_expand_sse_compare (d, exp, target, swap);
   13150              :     }
   13151              : 
   13152        59510 :   if (rmode == VOIDmode || rmode == tmode)
   13153              :     {
   13154        59325 :       if (optimize
   13155        17715 :           || target == 0
   13156        17715 :           || GET_MODE (target) != tmode
   13157        76838 :           || !insn_p->operand[0].predicate (target, tmode))
   13158        41900 :         target = gen_reg_rtx (tmode);
   13159        17425 :       else if (memory_operand (target, tmode))
   13160          578 :         num_memory++;
   13161              :       real_target = target;
   13162              :     }
   13163              :   else
   13164              :     {
   13165          185 :       real_target = gen_reg_rtx (tmode);
   13166          185 :       target = lowpart_subreg (rmode, real_target, tmode);
   13167              :     }
   13168              : 
   13169       257967 :   for (i = 0; i < nargs; i++)
   13170              :     {
   13171       198690 :       tree arg = CALL_EXPR_ARG (exp, i);
   13172       198690 :       rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
   13173       198690 :       machine_mode mode = insn_p->operand[i + 1].mode;
   13174              :       /* Need to fixup modeless constant before testing predicate.  */
   13175       198690 :       op = fixup_modeless_constant (op, mode);
   13176       198690 :       bool match = insn_p->operand[i + 1].predicate (op, mode);
   13177              : 
   13178       198690 :       if (second_arg_count && i == 1)
   13179              :         {
   13180              :           /* SIMD shift insns take either an 8-bit immediate or
   13181              :              register as count.  But builtin functions take int as
   13182              :              count.  If count doesn't match, we put it in register.
   13183              :              The instructions are using 64-bit count, if op is just
   13184              :              32-bit, zero-extend it, as negative shift counts
   13185              :              are undefined behavior and zero-extension is more
   13186              :              efficient.  */
   13187         2889 :           if (!match)
   13188              :             {
   13189         1750 :               if (SCALAR_INT_MODE_P (GET_MODE (op)))
   13190          489 :                 op = convert_modes (mode, GET_MODE (op), op, 1);
   13191              :               else
   13192         1261 :                 op = lowpart_subreg (mode, op, GET_MODE (op));
   13193         1750 :               if (!insn_p->operand[i + 1].predicate (op, mode))
   13194          190 :                 op = copy_to_reg (op);
   13195              :             }
   13196              :         }
   13197       195801 :       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   13198       147745 :                (!mask_pos && (nargs - i) <= nargs_constant))
   13199              :         {
   13200        16283 :           if (!match)
   13201          233 :             switch (icode)
   13202              :               {
   13203            2 :               case CODE_FOR_avx_vinsertf128v4di:
   13204            2 :               case CODE_FOR_avx_vextractf128v4di:
   13205            2 :                 error ("the last argument must be an 1-bit immediate");
   13206            2 :                 return const0_rtx;
   13207              : 
   13208            8 :               case CODE_FOR_avx512f_cmpv8di3_mask:
   13209            8 :               case CODE_FOR_avx512f_cmpv16si3_mask:
   13210            8 :               case CODE_FOR_avx512f_ucmpv8di3_mask:
   13211            8 :               case CODE_FOR_avx512f_ucmpv16si3_mask:
   13212            8 :               case CODE_FOR_avx512vl_cmpv4di3_mask:
   13213            8 :               case CODE_FOR_avx512vl_cmpv8si3_mask:
   13214            8 :               case CODE_FOR_avx512vl_ucmpv4di3_mask:
   13215            8 :               case CODE_FOR_avx512vl_ucmpv8si3_mask:
   13216            8 :               case CODE_FOR_avx512vl_cmpv2di3_mask:
   13217            8 :               case CODE_FOR_avx512vl_cmpv4si3_mask:
   13218            8 :               case CODE_FOR_avx512vl_ucmpv2di3_mask:
   13219            8 :               case CODE_FOR_avx512vl_ucmpv4si3_mask:
   13220            8 :                 error ("the last argument must be a 3-bit immediate");
   13221            8 :                 return const0_rtx;
   13222              : 
   13223           24 :               case CODE_FOR_sse4_1_roundsd:
   13224           24 :               case CODE_FOR_sse4_1_roundss:
   13225              : 
   13226           24 :               case CODE_FOR_sse4_1_roundpd:
   13227           24 :               case CODE_FOR_sse4_1_roundps:
   13228           24 :               case CODE_FOR_avx_roundpd256:
   13229           24 :               case CODE_FOR_avx_roundps256:
   13230              : 
   13231           24 :               case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
   13232           24 :               case CODE_FOR_sse4_1_roundps_sfix:
   13233           24 :               case CODE_FOR_avx_roundpd_vec_pack_sfix256:
   13234           24 :               case CODE_FOR_avx_roundps_sfix256:
   13235              : 
   13236           24 :               case CODE_FOR_sse4_1_blendps:
   13237           24 :               case CODE_FOR_avx_blendpd256:
   13238           24 :               case CODE_FOR_avx_vpermilv4df:
   13239           24 :               case CODE_FOR_avx_vpermilv4df_mask:
   13240           24 :               case CODE_FOR_avx512f_getmantv8df_mask:
   13241           24 :               case CODE_FOR_avx512f_getmantv16sf_mask:
   13242           24 :               case CODE_FOR_avx512vl_getmantv16hf_mask:
   13243           24 :               case CODE_FOR_avx512vl_getmantv8sf_mask:
   13244           24 :               case CODE_FOR_avx512vl_getmantv4df_mask:
   13245           24 :               case CODE_FOR_avx512fp16_getmantv8hf_mask:
   13246           24 :               case CODE_FOR_avx512vl_getmantv4sf_mask:
   13247           24 :               case CODE_FOR_avx512vl_getmantv2df_mask:
   13248           24 :               case CODE_FOR_avx512dq_rangepv8df_mask_round:
   13249           24 :               case CODE_FOR_avx512dq_rangepv16sf_mask_round:
   13250           24 :               case CODE_FOR_avx512dq_rangepv4df_mask:
   13251           24 :               case CODE_FOR_avx512dq_rangepv8sf_mask:
   13252           24 :               case CODE_FOR_avx512dq_rangepv2df_mask:
   13253           24 :               case CODE_FOR_avx512dq_rangepv4sf_mask:
   13254           24 :               case CODE_FOR_avx_shufpd256_mask:
   13255           24 :                 error ("the last argument must be a 4-bit immediate");
   13256           24 :                 return const0_rtx;
   13257              : 
   13258           15 :               case CODE_FOR_sha1rnds4:
   13259           15 :               case CODE_FOR_sse4_1_blendpd:
   13260           15 :               case CODE_FOR_avx_vpermilv2df:
   13261           15 :               case CODE_FOR_avx_vpermilv2df_mask:
   13262           15 :               case CODE_FOR_xop_vpermil2v2df3:
   13263           15 :               case CODE_FOR_xop_vpermil2v4sf3:
   13264           15 :               case CODE_FOR_xop_vpermil2v4df3:
   13265           15 :               case CODE_FOR_xop_vpermil2v8sf3:
   13266           15 :               case CODE_FOR_avx512f_vinsertf32x4_mask:
   13267           15 :               case CODE_FOR_avx512f_vinserti32x4_mask:
   13268           15 :               case CODE_FOR_avx512f_vextractf32x4_mask:
   13269           15 :               case CODE_FOR_avx512f_vextracti32x4_mask:
   13270           15 :               case CODE_FOR_sse2_shufpd:
   13271           15 :               case CODE_FOR_sse2_shufpd_mask:
   13272           15 :               case CODE_FOR_avx512dq_shuf_f64x2_mask:
   13273           15 :               case CODE_FOR_avx512dq_shuf_i64x2_mask:
   13274           15 :               case CODE_FOR_avx512vl_shuf_i32x4_mask:
   13275           15 :               case CODE_FOR_avx512vl_shuf_f32x4_mask:
   13276           15 :                 error ("the last argument must be a 2-bit immediate");
   13277           15 :                 return const0_rtx;
   13278              : 
   13279           30 :               case CODE_FOR_avx_vextractf128v4df:
   13280           30 :               case CODE_FOR_avx_vextractf128v8sf:
   13281           30 :               case CODE_FOR_avx_vextractf128v8si:
   13282           30 :               case CODE_FOR_avx_vinsertf128v4df:
   13283           30 :               case CODE_FOR_avx_vinsertf128v8sf:
   13284           30 :               case CODE_FOR_avx_vinsertf128v8si:
   13285           30 :               case CODE_FOR_avx512f_vinsertf64x4_mask:
   13286           30 :               case CODE_FOR_avx512f_vinserti64x4_mask:
   13287           30 :               case CODE_FOR_avx512f_vextractf64x4_mask:
   13288           30 :               case CODE_FOR_avx512f_vextracti64x4_mask:
   13289           30 :               case CODE_FOR_avx512dq_vinsertf32x8_mask:
   13290           30 :               case CODE_FOR_avx512dq_vinserti32x8_mask:
   13291           30 :               case CODE_FOR_avx512vl_vinsertv4df:
   13292           30 :               case CODE_FOR_avx512vl_vinsertv4di:
   13293           30 :               case CODE_FOR_avx512vl_vinsertv8sf:
   13294           30 :               case CODE_FOR_avx512vl_vinsertv8si:
   13295           30 :                 error ("the last argument must be a 1-bit immediate");
   13296           30 :                 return const0_rtx;
   13297              : 
   13298           16 :               case CODE_FOR_avx_vmcmpv2df3:
   13299           16 :               case CODE_FOR_avx_vmcmpv4sf3:
   13300           16 :               case CODE_FOR_avx_cmpv2df3:
   13301           16 :               case CODE_FOR_avx_cmpv4sf3:
   13302           16 :                 if (CONST_INT_P (op) && IN_RANGE (INTVAL (op), 8, 31))
   13303              :                   {
   13304            4 :                     error ("'%s' needs isa option %s", d->name, "-mavx");
   13305            4 :                     return const0_rtx;
   13306              :                   }
   13307              :                 /* FALLTHRU */
   13308           18 :               case CODE_FOR_avx_cmpv4df3:
   13309           18 :               case CODE_FOR_avx_cmpv8sf3:
   13310           18 :               case CODE_FOR_avx512f_cmpv8df3_mask:
   13311           18 :               case CODE_FOR_avx512f_cmpv16sf3_mask:
   13312           18 :               case CODE_FOR_avx512f_vmcmpv2df3_mask:
   13313           18 :               case CODE_FOR_avx512f_vmcmpv4sf3_mask:
   13314           18 :               case CODE_FOR_avx512bw_cmpv32hf3_mask:
   13315           18 :               case CODE_FOR_avx512vl_cmpv16hf3_mask:
   13316           18 :               case CODE_FOR_avx512fp16_cmpv8hf3_mask:
   13317           18 :                 error ("the last argument must be a 5-bit immediate");
   13318           18 :                 return const0_rtx;
   13319              : 
   13320          132 :               default:
   13321          132 :                 switch (nargs_constant)
   13322              :                   {
   13323            8 :                   case 2:
   13324            8 :                     if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   13325            8 :                         (!mask_pos && (nargs - i) == nargs_constant))
   13326              :                       {
   13327            4 :                         error ("the next to last argument must be an 8-bit immediate");
   13328            4 :                         break;
   13329              :                       }
   13330              :                     /* FALLTHRU */
   13331          128 :                   case 1:
   13332          128 :                     error ("the last argument must be an 8-bit immediate");
   13333          128 :                     break;
   13334            0 :                   default:
   13335            0 :                     gcc_unreachable ();
   13336              :                   }
   13337          132 :                 return const0_rtx;
   13338              :               }
   13339              :         }
   13340              :       else
   13341              :         {
   13342       179518 :           if (VECTOR_MODE_P (mode))
   13343       128929 :             op = safe_vector_operand (op, mode);
   13344              : 
   13345              :           /* If we aren't optimizing, only allow one memory operand to
   13346              :              be generated.  */
   13347       179518 :           if (memory_operand (op, mode))
   13348              :             {
   13349        29824 :               num_memory++;
   13350        29824 :               if (!optimize && num_memory > 1)
   13351        13586 :                 op = copy_to_mode_reg (mode, op);
   13352              :             }
   13353              : 
   13354       179518 :           if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   13355              :             {
   13356       177365 :               if (!match)
   13357        42567 :                 op = copy_to_mode_reg (mode, op);
   13358              :             }
   13359              :           else
   13360              :             {
   13361         2153 :               op = copy_to_reg (op);
   13362         2153 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   13363              :             }
   13364              :         }
   13365              : 
   13366       198457 :       xops[i] = op;
   13367              :     }
   13368              : 
   13369        59277 :   switch (nargs)
   13370              :     {
   13371         3393 :     case 1:
   13372         3393 :       pat = GEN_FCN (icode) (real_target, xops[0]);
   13373         3393 :       break;
   13374         5481 :     case 2:
   13375         5481 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
   13376         5481 :       break;
   13377        20621 :     case 3:
   13378        20621 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
   13379        20621 :       break;
   13380        27042 :     case 4:
   13381        27042 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13382        27042 :                              xops[2], xops[3]);
   13383        27042 :       break;
   13384         2740 :     case 5:
   13385         2740 :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13386         2740 :                              xops[2], xops[3], xops[4]);
   13387         2740 :       break;
   13388              :     case 6:
   13389              :       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   13390              :                              xops[2], xops[3], xops[4], xops[5]);
   13391              :       break;
   13392              :     default:
   13393              :       gcc_unreachable ();
   13394              :     }
   13395              : 
   13396        59277 :   if (! pat)
   13397              :     return 0;
   13398              : 
   13399        59277 :   emit_insn (pat);
   13400        59277 :   return target;
   13401              : }
   13402              : 
   13403              : /* Transform pattern of following layout:
   13404              :      (set A
   13405              :        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   13406              :      )
   13407              :    into:
   13408              :      (set (A B)) */
   13409              : 
   13410              : static rtx
   13411         4931 : ix86_erase_embedded_rounding (rtx pat)
   13412              : {
   13413         4931 :   if (NONJUMP_INSN_P (pat))
   13414          685 :     pat = PATTERN (pat);
   13415              : 
   13416         4931 :   gcc_assert (GET_CODE (pat) == SET);
   13417         4931 :   rtx src = SET_SRC (pat);
   13418         4931 :   gcc_assert (XVECLEN (src, 0) == 2);
   13419         4931 :   rtx p0 = XVECEXP (src, 0, 0);
   13420         4931 :   gcc_assert (GET_CODE (src) == UNSPEC
   13421              :               && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
   13422         4931 :   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
   13423         4931 :   return res;
   13424              : }
   13425              : 
   13426              : /* Subroutine of ix86_expand_round_builtin to take care of comi insns
   13427              :    with rounding.  */
   13428              : static rtx
   13429          103 : ix86_expand_sse_comi_round (const struct builtin_description *d,
   13430              :                             tree exp, rtx target, bool comx_ok)
   13431              : {
   13432          103 :   rtx pat, set_dst;
   13433          103 :   tree arg0 = CALL_EXPR_ARG (exp, 0);
   13434          103 :   tree arg1 = CALL_EXPR_ARG (exp, 1);
   13435          103 :   tree arg2 = CALL_EXPR_ARG (exp, 2);
   13436          103 :   tree arg3 = CALL_EXPR_ARG (exp, 3);
   13437          103 :   rtx op0 = expand_normal (arg0);
   13438          103 :   rtx op1 = expand_normal (arg1);
   13439          103 :   rtx op2 = expand_normal (arg2);
   13440          103 :   rtx op3 = expand_normal (arg3);
   13441          103 :   enum insn_code icode = d->icode;
   13442          103 :   const struct insn_data_d *insn_p = &insn_data[icode];
   13443          103 :   machine_mode mode0 = insn_p->operand[0].mode;
   13444          103 :   machine_mode mode1 = insn_p->operand[1].mode;
   13445              : 
   13446              :   /* See avxintrin.h for values.  */
   13447          103 :   static const enum rtx_code comparisons[32] =
   13448              :     {
   13449              :       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
   13450              :       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
   13451              :       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
   13452              :       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
   13453              :     };
   13454          103 :   static const bool ordereds[32] =
   13455              :     {
   13456              :       true,  true,  true,  false, false, false, false, true,
   13457              :       false, false, false, true,  true,  true,  true,  false,
   13458              :       true,  true,  true,  false, false, false, false, true,
   13459              :       false, false, false, true,  true,  true,  true,  false
   13460              :     };
   13461          103 :   static const bool non_signalings[32] =
   13462              :     {
   13463              :       true,  false, false, true,  true,  false, false, true,
   13464              :       true,  false, false, true,  true,  false, false, true,
   13465              :       false, true,  true,  false, false, true,  true,  false,
   13466              :       false, true,  true,  false, false, true,  true,  false
   13467              :     };
   13468              : 
   13469          103 :   if (!CONST_INT_P (op2))
   13470              :     {
   13471            0 :       error ("the third argument must be comparison constant");
   13472            0 :       return const0_rtx;
   13473              :     }
   13474          103 :   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
   13475              :     {
   13476            0 :       error ("incorrect comparison mode");
   13477            0 :       return const0_rtx;
   13478              :     }
   13479              : 
   13480          103 :   if (!insn_p->operand[2].predicate (op3, SImode))
   13481              :     {
   13482            4 :       error ("incorrect rounding operand");
   13483            4 :       return const0_rtx;
   13484              :     }
   13485              : 
   13486           99 :   if (VECTOR_MODE_P (mode0))
   13487           99 :     op0 = safe_vector_operand (op0, mode0);
   13488           99 :   if (VECTOR_MODE_P (mode1))
   13489           99 :     op1 = safe_vector_operand (op1, mode1);
   13490              : 
   13491           99 :   enum rtx_code comparison = comparisons[INTVAL (op2)];
   13492           99 :   enum rtx_code orig_comp = comparison;
   13493           99 :   bool ordered = ordereds[INTVAL (op2)];
   13494           99 :   bool non_signaling = non_signalings[INTVAL (op2)];
   13495           99 :   rtx const_val = const0_rtx;
   13496              : 
   13497           99 :   bool check_unordered = false;
   13498           99 :   machine_mode mode = CCFPmode;
   13499           99 :   switch (comparison)
   13500              :     {
   13501            8 :     case ORDERED:
   13502            8 :       if (!ordered)
   13503              :         {
   13504            4 :           if (TARGET_AVX10_2 && comx_ok)
   13505              :             {
   13506              :               /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
   13507              :                  differently. So directly return true here.  */
   13508            0 :               target = gen_reg_rtx (SImode);
   13509            0 :               emit_move_insn (target, const1_rtx);
   13510            0 :               return target;
   13511              :             }
   13512              :           else
   13513              :             {
   13514              :               /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
   13515              :               if (!non_signaling)
   13516           99 :                 ordered = true;
   13517           99 :               mode = CCSmode;
   13518              :             }
   13519              :         }
   13520              :       else
   13521              :         {
   13522              :           /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
   13523              :           if (non_signaling)
   13524              :             ordered = false;
   13525              :           mode = CCPmode;
   13526              :         }
   13527              :       comparison = NE;
   13528              :       break;
   13529            8 :     case UNORDERED:
   13530            8 :       if (ordered)
   13531              :         {
   13532            4 :           if (TARGET_AVX10_2 && comx_ok)
   13533              :             {
   13534              :               /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
   13535              :                  differently. So directly return false here.  */
   13536            0 :               target = gen_reg_rtx (SImode);
   13537            0 :               emit_move_insn (target, const0_rtx);
   13538            0 :               return target;
   13539              :             }
   13540              :           else
   13541              :             {
   13542              :               /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
   13543              :               if (non_signaling)
   13544           99 :                 ordered = false;
   13545              :               mode = CCSmode;
   13546              :             }
   13547              :         }
   13548              :       else
   13549              :         {
   13550              :           /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
   13551              :           if (!non_signaling)
   13552           99 :             ordered = true;
   13553           99 :           mode = CCPmode;
   13554              :         }
   13555              :       comparison = EQ;
   13556              :       break;
   13557              : 
   13558           40 :     case LE:    /* -> GE  */
   13559           40 :     case LT:    /* -> GT  */
   13560           40 :     case UNGE:  /* -> UNLE  */
   13561           40 :     case UNGT:  /* -> UNLT  */
   13562           40 :       std::swap (op0, op1);
   13563           40 :       comparison = swap_condition (comparison);
   13564              :       /* FALLTHRU */
   13565           68 :     case GT:
   13566           68 :     case GE:
   13567           68 :     case UNEQ:
   13568           68 :     case UNLT:
   13569           68 :     case UNLE:
   13570           68 :     case LTGT:
   13571              :       /* These are supported by CCFPmode.  NB: Use ordered/signaling
   13572              :          COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
   13573              :          with NAN operands.  */
   13574           68 :       if (ordered == non_signaling)
   13575              :         ordered = !ordered;
   13576              :       break;
   13577              :       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
   13578              :          _CMP_EQ_OQ/_CMP_EQ_OS.
   13579              :          Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
   13580              :          of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
   13581            8 :     case EQ:
   13582            8 :       if (!TARGET_AVX10_2 || !comx_ok)
   13583            5 :         check_unordered = true;
   13584              :       mode = CCZmode;
   13585              :       break;
   13586            7 :     case NE:
   13587              :       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
   13588              :          _CMP_NEQ_UQ/_CMP_NEQ_US.
   13589              :          Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
   13590              :          of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
   13591            7 :       gcc_assert (!ordered);
   13592            7 :       if (!TARGET_AVX10_2 || !comx_ok)
   13593            4 :         check_unordered = true;
   13594            7 :       mode = CCZmode;
   13595            7 :       const_val = const1_rtx;
   13596            7 :       break;
   13597            0 :     default:
   13598            0 :       gcc_unreachable ();
   13599              :     }
   13600              : 
   13601           99 :   target = gen_reg_rtx (SImode);
   13602           99 :   emit_move_insn (target, const_val);
   13603           99 :   target = gen_rtx_SUBREG (QImode, target, 0);
   13604              : 
   13605           93 :   if ((optimize && !register_operand (op0, mode0))
   13606          192 :       || !insn_p->operand[0].predicate (op0, mode0))
   13607            6 :     op0 = copy_to_mode_reg (mode0, op0);
   13608           93 :   if ((optimize && !register_operand (op1, mode1))
   13609          192 :       || !insn_p->operand[1].predicate (op1, mode1))
   13610            6 :     op1 = copy_to_mode_reg (mode1, op1);
   13611              : 
   13612              :     /* Generate comx instead of comi when EQ/NE to avoid NAN checks.
   13613              :        Use orig_comp to exclude ORDERED/UNORDERED cases.  */
   13614           99 :   if ((orig_comp == EQ || orig_comp == NE)
   13615           15 :       && TARGET_AVX10_2 && comx_ok)
   13616              :     {
   13617            6 :       switch (icode)
   13618              :         {
   13619              :         case CODE_FOR_avx512fp16_comi_round:
   13620           99 :           icode = CODE_FOR_avx10_2_comxhf_round;
   13621              :           break;
   13622            4 :         case CODE_FOR_sse_comi_round:
   13623            4 :           icode = CODE_FOR_avx10_2_comxsf_round;
   13624            4 :           break;
   13625            2 :         case CODE_FOR_sse2_comi_round:
   13626            2 :           icode = CODE_FOR_avx10_2_comxdf_round;
   13627            2 :           break;
   13628              : 
   13629              :         default:
   13630              :           break;
   13631              :         }
   13632              :     }
   13633              : 
   13634              :   /* Generate comi instead of comx when UNEQ/LTGT to avoid NAN checks.  */
   13635           99 :   if ((comparison == UNEQ || comparison == LTGT)
   13636            8 :        && TARGET_AVX10_2 && comx_ok)
   13637              :     {
   13638            0 :       switch (icode)
   13639              :         {
   13640              :         case CODE_FOR_avx10_2_comxhf_round:
   13641           99 :           icode = CODE_FOR_avx512fp16_comi_round;
   13642              :           break;
   13643            0 :         case CODE_FOR_avx10_2_comxsf_round:
   13644            0 :           icode = CODE_FOR_sse_comi_round;
   13645            0 :           break;
   13646            0 :         case CODE_FOR_avx10_2_comxdf_round:
   13647            0 :           icode = CODE_FOR_sse2_comi_round;
   13648            0 :           break;
   13649              : 
   13650              :         default:
   13651              :           break;
   13652              :         }
   13653              :     }
   13654              : 
   13655              :   /*
   13656              :      1. COMI/VCOMX: ordered and signaling.
   13657              :      2. UCOMI/VUCOMX: unordered and non-signaling.
   13658              :    */
   13659           99 :   if (non_signaling)
   13660           38 :     switch (icode)
   13661              :       {
   13662              :       case CODE_FOR_sse_comi_round:
   13663              :         icode = CODE_FOR_sse_ucomi_round;
   13664              :         break;
   13665           17 :       case CODE_FOR_sse2_comi_round:
   13666           17 :         icode = CODE_FOR_sse2_ucomi_round;
   13667           17 :         break;
   13668            0 :       case CODE_FOR_avx512fp16_comi_round:
   13669            0 :         icode = CODE_FOR_avx512fp16_ucomi_round;
   13670            0 :         break;
   13671            3 :       case CODE_FOR_avx10_2_comxsf_round:
   13672            3 :         icode = CODE_FOR_avx10_2_ucomxsf_round;
   13673            3 :         break;
   13674            0 :       case CODE_FOR_avx10_2_comxhf_round:
   13675            0 :         icode = CODE_FOR_avx10_2_ucomxhf_round;
   13676            0 :         break;
   13677            1 :       case CODE_FOR_avx10_2_comxdf_round:
   13678            1 :         icode = CODE_FOR_avx10_2_ucomxdf_round;
   13679            1 :         break;
   13680            0 :       default:
   13681            0 :         gcc_unreachable ();
   13682              :       }
   13683              : 
   13684           99 :   pat = GEN_FCN (icode) (op0, op1, op3);
   13685           99 :   if (! pat)
   13686              :     return 0;
   13687              : 
   13688              :   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
   13689           99 :   if (INTVAL (op3) == NO_ROUND)
   13690              :     {
   13691            1 :       pat = ix86_erase_embedded_rounding (pat);
   13692            1 :       if (! pat)
   13693              :         return 0;
   13694              : 
   13695            1 :       set_dst = SET_DEST (pat);
   13696              :     }
   13697              :   else
   13698              :     {
   13699           98 :       gcc_assert (GET_CODE (pat) == SET);
   13700           98 :       set_dst = SET_DEST (pat);
   13701              :     }
   13702              : 
   13703           99 :   emit_insn (pat);
   13704              : 
   13705           99 :   return ix86_ssecom_setcc (comparison, check_unordered, mode,
   13706           99 :                             set_dst, target);
   13707              : }
   13708              : 
                      : /* Subroutine of ix86_expand_builtin to take care of builtins whose
                      :    last argument is an embedded rounding / SAE immediate.  D describes
                      :    the builtin, EXP is the CALL_EXPR, TARGET is a suggested location
                      :    for the result.  Returns the result rtx, const0_rtx after reporting
                      :    an invalid-operand error, or 0 if the insn pattern could not be
                      :    generated.  */
   13709              : static rtx
   13710        15554 : ix86_expand_round_builtin (const struct builtin_description *d,
   13711              :                            tree exp, rtx target)
   13712              : {
   13713        15554 :   rtx pat;
   13714        15554 :   unsigned int i, nargs;
   13715        15554 :   rtx xops[6];
   13716        15554 :   enum insn_code icode = d->icode;
   13717        15554 :   const struct insn_data_d *insn_p = &insn_data[icode];
   13718        15554 :   machine_mode tmode = insn_p->operand[0].mode;
   13719        15554 :   unsigned int nargs_constant = 0;
   13720        15554 :   unsigned int redundant_embed_rnd = 0;
   13721              : 
                      :   /* Classify the builtin by its prototype: set NARGS (total number of
                      :      call arguments, including the trailing rounding immediate) and,
                      :      where relevant, NARGS_CONSTANT so that the argument at index
                      :      NARGS - NARGS_CONSTANT is the one that must be a matching
                      :      immediate (checked below).  The comi round builtins are fully
                      :      delegated to ix86_expand_sse_comi_round instead.  */
   13722        15554 :   switch ((enum ix86_builtin_func_type) d->flag)
   13723              :     {
   13724              :     case UINT64_FTYPE_V2DF_INT:
   13725              :     case UINT64_FTYPE_V4SF_INT:
   13726              :     case UINT64_FTYPE_V8HF_INT:
   13727              :     case UINT_FTYPE_V2DF_INT:
   13728              :     case UINT_FTYPE_V4SF_INT:
   13729              :     case UINT_FTYPE_V8HF_INT:
   13730              :     case INT64_FTYPE_V2DF_INT:
   13731              :     case INT64_FTYPE_V4SF_INT:
   13732              :     case INT64_FTYPE_V8HF_INT:
   13733              :     case INT_FTYPE_V2DF_INT:
   13734              :     case INT_FTYPE_V4SF_INT:
   13735              :     case INT_FTYPE_V8HF_INT:
   13736              :       nargs = 2;
   13737              :       break;
   13738          642 :     case V32HF_FTYPE_V32HF_V32HF_INT:
   13739          642 :     case V8HF_FTYPE_V8HF_V8HF_INT:
   13740          642 :     case V8HF_FTYPE_V8HF_INT_INT:
   13741          642 :     case V8HF_FTYPE_V8HF_UINT_INT:
   13742          642 :     case V8HF_FTYPE_V8HF_INT64_INT:
   13743          642 :     case V8HF_FTYPE_V8HF_UINT64_INT:
   13744          642 :     case V4SF_FTYPE_V4SF_UINT_INT:
   13745          642 :     case V4SF_FTYPE_V4SF_UINT64_INT:
   13746          642 :     case V2DF_FTYPE_V2DF_UINT64_INT:
   13747          642 :     case V4SF_FTYPE_V4SF_INT_INT:
   13748          642 :     case V4SF_FTYPE_V4SF_INT64_INT:
   13749          642 :     case V2DF_FTYPE_V2DF_INT64_INT:
   13750          642 :     case V4SF_FTYPE_V4SF_V4SF_INT:
   13751          642 :     case V2DF_FTYPE_V2DF_V2DF_INT:
   13752          642 :     case V4SF_FTYPE_V4SF_V2DF_INT:
   13753          642 :     case V2DF_FTYPE_V2DF_V4SF_INT:
   13754          642 :       nargs = 3;
   13755          642 :       break;
   13756         4554 :     case V8SF_FTYPE_V8DF_V8SF_QI_INT:
   13757         4554 :     case V8DF_FTYPE_V8DF_V8DF_QI_INT:
   13758         4554 :     case V32HI_FTYPE_V32HF_V32HI_USI_INT:
   13759         4554 :     case V32HI_FTYPE_V32BF_V32HI_USI_INT:
   13760         4554 :     case V8SI_FTYPE_V8DF_V8SI_QI_INT:
   13761         4554 :     case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
   13762         4554 :     case V8DI_FTYPE_V8DF_V8DI_QI_INT:
   13763         4554 :     case V8SF_FTYPE_V8DI_V8SF_QI_INT:
   13764         4554 :     case V8DF_FTYPE_V8DI_V8DF_QI_INT:
   13765         4554 :     case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
   13766         4554 :     case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
   13767         4554 :     case V32HF_FTYPE_V32HI_V32HF_USI_INT:
   13768         4554 :     case V32HF_FTYPE_V32HF_V32HF_USI_INT:
   13769         4554 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
   13770         4554 :     case V16SF_FTYPE_V16SF_V16SF_HI_INT:
   13771         4554 :     case V8DI_FTYPE_V8SF_V8DI_QI_INT:
   13772         4554 :     case V16SF_FTYPE_V16SI_V16SF_HI_INT:
   13773         4554 :     case V16SI_FTYPE_V16SF_V16SI_HI_INT:
   13774         4554 :     case V16SI_FTYPE_V16SF_V16SI_UHI_INT:
   13775         4554 :     case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
   13776         4554 :     case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
   13777         4554 :     case V8DF_FTYPE_V8SF_V8DF_QI_INT:
   13778         4554 :     case V16SF_FTYPE_V16HI_V16SF_HI_INT:
   13779         4554 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
   13780         4554 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
   13781         4554 :     case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
   13782         4554 :     case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
   13783         4554 :     case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
   13784         4554 :     case V16HI_FTYPE_V16BF_V16HI_UHI_INT:
   13785         4554 :     case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
   13786         4554 :       nargs = 4;
   13787         4554 :       break;
   13788          171 :     case V4SF_FTYPE_V4SF_V4SF_INT_INT:
   13789          171 :     case V2DF_FTYPE_V2DF_V2DF_INT_INT:
   13790          171 :       nargs_constant = 2;
   13791          171 :       nargs = 4;
   13792          171 :       break;
   13793          103 :     case INT_FTYPE_V4SF_V4SF_INT_INT:
   13794          103 :     case INT_FTYPE_V2DF_V2DF_INT_INT:
   13795          103 :       return ix86_expand_sse_comi_round (d, exp, target, true);
   13796         6220 :     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
   13797         6220 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
   13798         6220 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
   13799         6220 :     case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
   13800         6220 :     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
   13801         6220 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
   13802         6220 :     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
   13803         6220 :     case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
   13804         6220 :     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
   13805         6220 :     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
   13806         6220 :     case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
   13807         6220 :     case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
   13808         6220 :     case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
   13809         6220 :     case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
   13810         6220 :     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
   13811         6220 :     case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
   13812         6220 :     case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
   13813         6220 :     case V32HF_FTYPE_V16SF_V16SF_V32HF_USI_INT:
   13814         6220 :       nargs = 5;
   13815         6220 :       break;
   13816          635 :     case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
   13817          635 :     case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
   13818          635 :     case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
   13819          635 :     case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
   13820          635 :     case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
   13821          635 :       nargs_constant = 4;
   13822          635 :       nargs = 5;
   13823          635 :       break;
   13824         1181 :     case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
   13825         1181 :     case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
   13826         1181 :     case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
   13827         1181 :     case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
   13828         1181 :     case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
   13829         1181 :     case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
   13830         1181 :       nargs_constant = 3;
   13831         1181 :       nargs = 5;
   13832         1181 :       break;
   13833         1071 :     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
   13834         1071 :     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
   13835         1071 :     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
   13836         1071 :     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
   13837         1071 :     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
   13838         1071 :     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
   13839         1071 :     case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
   13840         1071 :     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI_INT:
   13841         1071 :     case V32HF_FTYPE_V32HF_V32HF_INT_V32HF_USI_INT:
   13842         1071 :     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI_INT:
   13843         1071 :       nargs = 6;
   13844         1071 :       nargs_constant = 4;
   13845         1071 :       break;
   13846          252 :     case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
   13847          252 :     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
   13848          252 :     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
   13849          252 :     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
   13850          252 :       nargs = 6;
   13851          252 :       nargs_constant = 3;
   13852          252 :       break;
   13853            0 :     default:
   13854            0 :       gcc_unreachable ();
   13855              :     }
   13856        14726 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   13857              : 
                      :   /* Reuse TARGET only when it already has the right mode and satisfies
                      :      the output operand's predicate; otherwise (or when optimizing)
                      :      allocate a fresh pseudo.  */
   13858        15451 :   if (optimize
   13859         4265 :       || target == 0
   13860         4265 :       || GET_MODE (target) != tmode
   13861        19716 :       || !insn_p->operand[0].predicate (target, tmode))
   13862        11186 :     target = gen_reg_rtx (tmode);
   13863              : 
                      :   /* Expand each call argument into the rtx operand the insn pattern
                      :      expects, validating immediates as we go.  */
   13864        85194 :   for (i = 0; i < nargs; i++)
   13865              :     {
   13866        70298 :       tree arg = CALL_EXPR_ARG (exp, i);
   13867        70298 :       rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
   13868        70298 :       machine_mode mode = insn_p->operand[i + 1].mode;
   13869        70298 :       bool match = insn_p->operand[i + 1].predicate (op, mode);
   13870              : 
                      :       /* The operand at index NARGS - NARGS_CONSTANT must be an
                      :          immediate accepted by its predicate; report an icode-specific
                      :          width in the diagnostic (4-bit for getmant, 5-bit for cmp,
                      :          8-bit otherwise).  */
   13871        70298 :       if (i == nargs - nargs_constant)
   13872              :         {
   13873         3310 :           if (!match)
   13874              :             {
   13875           40 :               switch (icode)
   13876              :                 {
   13877           12 :                 case CODE_FOR_avx512f_getmantv8df_mask_round:
   13878           12 :                 case CODE_FOR_avx512f_getmantv16sf_mask_round:
   13879           12 :                 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
   13880           12 :                 case CODE_FOR_avx512f_vgetmantv2df_round:
   13881           12 :                 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
   13882           12 :                 case CODE_FOR_avx512f_vgetmantv4sf_round:
   13883           12 :                 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
   13884           12 :                 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
   13885           12 :                   error ("the immediate argument must be a 4-bit immediate");
   13886           12 :                   return const0_rtx;
   13887            8 :                 case CODE_FOR_avx512f_cmpv8df3_mask_round:
   13888            8 :                 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
   13889            8 :                 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
   13890            8 :                 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
   13891            8 :                 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
   13892            8 :                 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
   13893            8 :                   error ("the immediate argument must be a 5-bit immediate");
   13894            8 :                   return const0_rtx;
   13895           20 :                 default:
   13896           20 :                   error ("the immediate argument must be an 8-bit immediate");
   13897           20 :                   return const0_rtx;
   13898              :                 }
   13899              :             }
   13900              :         }
                      :       /* The last argument is always the rounding/SAE immediate.  */
   13901        66988 :       else if (i == nargs-1)
   13902              :         {
   13903        15411 :           if (!insn_p->operand[nargs].predicate (op, SImode))
   13904              :             {
   13905          515 :               error ("incorrect rounding operand");
   13906          515 :               return const0_rtx;
   13907              :             }
   13908              : 
   13909              :           /* If there is no rounding use normal version of the pattern.  */
   13910        14896 :           if (INTVAL (op) == NO_ROUND)
   13911              :             {
   13912              :               /* Skip erasing embedded rounding for below expanders who
   13913              :                  generates multiple insns.  In ix86_erase_embedded_rounding
   13914              :                  the pattern will be transformed to a single set, and emit_insn
   13915              :                  appends the set instead of insert it to chain.  So the insns
   13916              :                  emitted inside define_expander would be ignored.  */
   13917         4962 :               switch (icode)
   13918              :                 {
   13919              :                 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
   13920              :                 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
   13921              :                 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
   13922              :                 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
   13923              :                 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
   13924              :                 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
   13925              :                   redundant_embed_rnd = 0;
   13926              :                   break;
   13927         4930 :                 default:
   13928         4930 :                   redundant_embed_rnd = 1;
   13929         4930 :                   break;
   13930              :                 }
   13931              :             }
   13932              :         }
                      :       /* Ordinary (non-immediate) operand: legitimize it and force it
                      :          into a register of the expected mode when needed.  */
   13933              :       else
   13934              :         {
   13935        51577 :           if (VECTOR_MODE_P (mode))
   13936        37673 :             op = safe_vector_operand (op, mode);
   13937              : 
   13938        51577 :           op = fixup_modeless_constant (op, mode);
   13939              : 
   13940        51577 :           if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   13941              :             {
   13942        51577 :               if (optimize || !match)
   13943        45249 :                 op = copy_to_mode_reg (mode, op);
   13944              :             }
   13945              :           else
   13946              :             {
                      :               /* Mode mismatch: copy to a register and reinterpret the
                      :                  low part in the expected mode.  */
   13947            0 :               op = copy_to_reg (op);
   13948            0 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   13949              :             }
   13950              :         }
   13951              : 
   13952        69743 :       xops[i] = op;
   13953              :     }
   13954              : 
                      :   /* Generate the insn, dispatching on operand count.  */
   13955        14896 :   switch (nargs)
   13956              :     {
   13957              :     case 1:
   13958              :       pat = GEN_FCN (icode) (target, xops[0]);
   13959              :       break;
   13960          692 :     case 2:
   13961          692 :       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
   13962          692 :       break;
   13963          598 :     case 3:
   13964          598 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
   13965          598 :       break;
   13966         4601 :     case 4:
   13967         4601 :       pat = GEN_FCN (icode) (target, xops[0], xops[1],
   13968         4601 :                              xops[2], xops[3]);
   13969         4601 :       break;
   13970         7732 :     case 5:
   13971         7732 :       pat = GEN_FCN (icode) (target, xops[0], xops[1],
   13972         7732 :                              xops[2], xops[3], xops[4]);
   13973         7732 :       break;
   13974         1273 :     case 6:
   13975         1273 :       pat = GEN_FCN (icode) (target, xops[0], xops[1],
   13976         1273 :                              xops[2], xops[3], xops[4], xops[5]);
   13977         1273 :       break;
   13978              :     default:
   13979              :       gcc_unreachable ();
   13980              :     }
   13981              : 
   13982        14896 :   if (!pat)
   13983              :     return 0;
   13984              : 
                      :   /* Rounding was NO_ROUND: strip the embedded-rounding wrapper so the
                      :      normal (non-round) form of the pattern is emitted.  */
   13985        14896 :   if (redundant_embed_rnd)
   13986         4930 :     pat = ix86_erase_embedded_rounding (pat);
   13987              : 
   13988        14896 :   emit_insn (pat);
   13989        14896 :   return target;
   13990              : }
   13991              : 
   13992              : /* Subroutine of ix86_expand_builtin to take care of special insns
   13993              :    with variable number of operands.  */
   13994              : 
   13995              : static rtx
   13996        27174 : ix86_expand_special_args_builtin (const struct builtin_description *d,
   13997              :                                   tree exp, rtx target)
   13998              : {
   13999        27174 :   tree arg;
   14000        27174 :   rtx pat, op;
   14001        27174 :   unsigned int i, nargs, arg_adjust, memory;
   14002        27174 :   unsigned int constant = 100;
   14003        27174 :   bool aligned_mem = false;
   14004        27174 :   rtx xops[4];
   14005        27174 :   enum insn_code icode = d->icode;
   14006        27174 :   const struct insn_data_d *insn_p = &insn_data[icode];
   14007        27174 :   machine_mode tmode = insn_p->operand[0].mode;
   14008        27174 :   enum { load, store } klass;
   14009              : 
   14010        27174 :   switch ((enum ix86_builtin_func_type) d->flag)
   14011              :     {
   14012        15366 :     case VOID_FTYPE_VOID:
   14013        15366 :       emit_insn (GEN_FCN (icode) (target));
   14014        15366 :       return 0;
   14015              :     case VOID_FTYPE_UINT64:
   14016              :     case VOID_FTYPE_UNSIGNED:
   14017              :       nargs = 0;
   14018              :       klass = store;
   14019              :       memory = 0;
   14020              :       break;
   14021              : 
   14022         7581 :     case INT_FTYPE_VOID:
   14023         7581 :     case USHORT_FTYPE_VOID:
   14024         7581 :     case UINT64_FTYPE_VOID:
   14025         7581 :     case UINT_FTYPE_VOID:
   14026         7581 :     case UINT8_FTYPE_VOID:
   14027         7581 :     case UNSIGNED_FTYPE_VOID:
   14028         7581 :       nargs = 0;
   14029         7581 :       klass = load;
   14030         7581 :       memory = 0;
   14031         7581 :       break;
   14032          358 :     case CHAR_FTYPE_PCCHAR:
   14033          358 :     case SHORT_FTYPE_PCSHORT:
   14034          358 :     case INT_FTYPE_PCINT:
   14035          358 :     case INT64_FTYPE_PCINT64:
   14036          358 :     case UINT64_FTYPE_PUNSIGNED:
   14037          358 :     case V2DI_FTYPE_PV2DI:
   14038          358 :     case V4DI_FTYPE_PV4DI:
   14039          358 :     case V32QI_FTYPE_PCCHAR:
   14040          358 :     case V16QI_FTYPE_PCCHAR:
   14041          358 :     case V8SF_FTYPE_PCV4SF:
   14042          358 :     case V8SF_FTYPE_PCFLOAT:
   14043          358 :     case V4SF_FTYPE_PCFLOAT:
   14044          358 :     case V4SF_FTYPE_PCFLOAT16:
   14045          358 :     case V4SF_FTYPE_PCBFLOAT16:
   14046          358 :     case V4SF_FTYPE_PCV8BF:
   14047          358 :     case V4SF_FTYPE_PCV8HF:
   14048          358 :     case V8SF_FTYPE_PCFLOAT16:
   14049          358 :     case V8SF_FTYPE_PCBFLOAT16:
   14050          358 :     case V8SF_FTYPE_PCV16HF:
   14051          358 :     case V8SF_FTYPE_PCV16BF:
   14052          358 :     case V4DF_FTYPE_PCV2DF:
   14053          358 :     case V4DF_FTYPE_PCDOUBLE:
   14054          358 :     case V2DF_FTYPE_PCDOUBLE:
   14055          358 :     case VOID_FTYPE_PVOID:
   14056          358 :     case V8DI_FTYPE_PV8DI:
   14057          358 :       nargs = 1;
   14058          358 :       klass = load;
   14059          358 :       memory = 0;
   14060          358 :       switch (icode)
   14061              :         {
   14062              :         case CODE_FOR_sse4_1_movntdqa:
   14063              :         case CODE_FOR_avx2_movntdqa:
   14064              :         case CODE_FOR_avx512f_movntdqa:
   14065              :           aligned_mem = true;
   14066              :           break;
   14067              :         default:
   14068              :           break;
   14069              :         }
   14070              :       break;
   14071          371 :     case VOID_FTYPE_PV2SF_V4SF:
   14072          371 :     case VOID_FTYPE_PV8DI_V8DI:
   14073          371 :     case VOID_FTYPE_PV4DI_V4DI:
   14074          371 :     case VOID_FTYPE_PV2DI_V2DI:
   14075          371 :     case VOID_FTYPE_PCHAR_V32QI:
   14076          371 :     case VOID_FTYPE_PCHAR_V16QI:
   14077          371 :     case VOID_FTYPE_PFLOAT_V16SF:
   14078          371 :     case VOID_FTYPE_PFLOAT_V8SF:
   14079          371 :     case VOID_FTYPE_PFLOAT_V4SF:
   14080          371 :     case VOID_FTYPE_PDOUBLE_V8DF:
   14081          371 :     case VOID_FTYPE_PDOUBLE_V4DF:
   14082          371 :     case VOID_FTYPE_PDOUBLE_V2DF:
   14083          371 :     case VOID_FTYPE_PLONGLONG_LONGLONG:
   14084          371 :     case VOID_FTYPE_PULONGLONG_ULONGLONG:
   14085          371 :     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
   14086          371 :     case VOID_FTYPE_PINT_INT:
   14087          371 :       nargs = 1;
   14088          371 :       klass = store;
   14089              :       /* Reserve memory operand for target.  */
   14090          371 :       memory = ARRAY_SIZE (xops);
   14091          371 :       switch (icode)
   14092              :         {
   14093              :         /* These builtins and instructions require the memory
   14094              :            to be properly aligned.  */
   14095              :         case CODE_FOR_avx_movntv4di:
   14096              :         case CODE_FOR_sse2_movntv2di:
   14097              :         case CODE_FOR_avx_movntv8sf:
   14098              :         case CODE_FOR_sse_movntv4sf:
   14099              :         case CODE_FOR_sse4a_vmmovntv4sf:
   14100              :         case CODE_FOR_avx_movntv4df:
   14101              :         case CODE_FOR_sse2_movntv2df:
   14102              :         case CODE_FOR_sse4a_vmmovntv2df:
   14103              :         case CODE_FOR_sse2_movntidi:
   14104              :         case CODE_FOR_sse_movntq:
   14105              :         case CODE_FOR_sse2_movntisi:
   14106              :         case CODE_FOR_avx512f_movntv16sf:
   14107              :         case CODE_FOR_avx512f_movntv8df:
   14108              :         case CODE_FOR_avx512f_movntv8di:
   14109              :           aligned_mem = true;
   14110              :           break;
   14111              :         default:
   14112              :           break;
   14113              :         }
   14114              :       break;
   14115            0 :     case VOID_FTYPE_PVOID_PCVOID:
   14116            0 :         nargs = 1;
   14117            0 :         klass = store;
   14118            0 :         memory = 0;
   14119              : 
   14120            0 :         break;
   14121           26 :     case V4SF_FTYPE_V4SF_PCV2SF:
   14122           26 :     case V2DF_FTYPE_V2DF_PCDOUBLE:
   14123           26 :       nargs = 2;
   14124           26 :       klass = load;
   14125           26 :       memory = 1;
   14126           26 :       break;
   14127           93 :     case V8SF_FTYPE_PCV8SF_V8SI:
   14128           93 :     case V4DF_FTYPE_PCV4DF_V4DI:
   14129           93 :     case V4SF_FTYPE_PCV4SF_V4SI:
   14130           93 :     case V2DF_FTYPE_PCV2DF_V2DI:
   14131           93 :     case V8SI_FTYPE_PCV8SI_V8SI:
   14132           93 :     case V4DI_FTYPE_PCV4DI_V4DI:
   14133           93 :     case V4SI_FTYPE_PCV4SI_V4SI:
   14134           93 :     case V2DI_FTYPE_PCV2DI_V2DI:
   14135           93 :     case VOID_FTYPE_INT_INT64:
   14136           93 :       nargs = 2;
   14137           93 :       klass = load;
   14138           93 :       memory = 0;
   14139           93 :       break;
   14140          360 :     case VOID_FTYPE_PV8DF_V8DF_UQI:
   14141          360 :     case VOID_FTYPE_PV4DF_V4DF_UQI:
   14142          360 :     case VOID_FTYPE_PV2DF_V2DF_UQI:
   14143          360 :     case VOID_FTYPE_PV16SF_V16SF_UHI:
   14144          360 :     case VOID_FTYPE_PV8SF_V8SF_UQI:
   14145          360 :     case VOID_FTYPE_PV4SF_V4SF_UQI:
   14146          360 :     case VOID_FTYPE_PV8DI_V8DI_UQI:
   14147          360 :     case VOID_FTYPE_PV4DI_V4DI_UQI:
   14148          360 :     case VOID_FTYPE_PV2DI_V2DI_UQI:
   14149          360 :     case VOID_FTYPE_PV16SI_V16SI_UHI:
   14150          360 :     case VOID_FTYPE_PV8SI_V8SI_UQI:
   14151          360 :     case VOID_FTYPE_PV4SI_V4SI_UQI:
   14152          360 :     case VOID_FTYPE_PV64QI_V64QI_UDI:
   14153          360 :     case VOID_FTYPE_PV32HI_V32HI_USI:
   14154          360 :     case VOID_FTYPE_PV32QI_V32QI_USI:
   14155          360 :     case VOID_FTYPE_PV16QI_V16QI_UHI:
   14156          360 :     case VOID_FTYPE_PV16HI_V16HI_UHI:
   14157          360 :     case VOID_FTYPE_PV8HI_V8HI_UQI:
   14158          360 :       switch (icode)
   14159              :         {
   14160              :         /* These builtins and instructions require the memory
   14161              :            to be properly aligned.  */
   14162              :         case CODE_FOR_avx512f_storev16sf_mask:
   14163              :         case CODE_FOR_avx512f_storev16si_mask:
   14164              :         case CODE_FOR_avx512f_storev8df_mask:
   14165              :         case CODE_FOR_avx512f_storev8di_mask:
   14166              :         case CODE_FOR_avx512vl_storev8sf_mask:
   14167              :         case CODE_FOR_avx512vl_storev8si_mask:
   14168              :         case CODE_FOR_avx512vl_storev4df_mask:
   14169              :         case CODE_FOR_avx512vl_storev4di_mask:
   14170              :         case CODE_FOR_avx512vl_storev4sf_mask:
   14171              :         case CODE_FOR_avx512vl_storev4si_mask:
   14172              :         case CODE_FOR_avx512vl_storev2df_mask:
   14173              :         case CODE_FOR_avx512vl_storev2di_mask:
   14174        11808 :           aligned_mem = true;
   14175              :           break;
   14176              :         default:
   14177              :           break;
   14178              :         }
   14179              :       /* FALLTHRU */
   14180              :     case VOID_FTYPE_PV8SF_V8SI_V8SF:
   14181              :     case VOID_FTYPE_PV4DF_V4DI_V4DF:
   14182              :     case VOID_FTYPE_PV4SF_V4SI_V4SF:
   14183              :     case VOID_FTYPE_PV2DF_V2DI_V2DF:
   14184              :     case VOID_FTYPE_PV8SI_V8SI_V8SI:
   14185              :     case VOID_FTYPE_PV4DI_V4DI_V4DI:
   14186              :     case VOID_FTYPE_PV4SI_V4SI_V4SI:
   14187              :     case VOID_FTYPE_PV2DI_V2DI_V2DI:
   14188              :     case VOID_FTYPE_PV8SI_V8DI_UQI:
   14189              :     case VOID_FTYPE_PV8HI_V8DI_UQI:
   14190              :     case VOID_FTYPE_PV16HI_V16SI_UHI:
   14191              :     case VOID_FTYPE_PUDI_V8DI_UQI:
   14192              :     case VOID_FTYPE_PV16QI_V16SI_UHI:
   14193              :     case VOID_FTYPE_PV4SI_V4DI_UQI:
   14194              :     case VOID_FTYPE_PUDI_V2DI_UQI:
   14195              :     case VOID_FTYPE_PUDI_V4DI_UQI:
   14196              :     case VOID_FTYPE_PUSI_V2DI_UQI:
   14197              :     case VOID_FTYPE_PV8HI_V8SI_UQI:
   14198              :     case VOID_FTYPE_PUDI_V4SI_UQI:
   14199              :     case VOID_FTYPE_PUSI_V4DI_UQI:
   14200              :     case VOID_FTYPE_PUHI_V2DI_UQI:
   14201              :     case VOID_FTYPE_PUDI_V8SI_UQI:
   14202              :     case VOID_FTYPE_PUSI_V4SI_UQI:
   14203              :     case VOID_FTYPE_PCHAR_V64QI_UDI:
   14204              :     case VOID_FTYPE_PCHAR_V32QI_USI:
   14205              :     case VOID_FTYPE_PCHAR_V16QI_UHI:
   14206              :     case VOID_FTYPE_PSHORT_V32HI_USI:
   14207              :     case VOID_FTYPE_PSHORT_V16HI_UHI:
   14208              :     case VOID_FTYPE_PSHORT_V8HI_UQI:
   14209              :     case VOID_FTYPE_PINT_V16SI_UHI:
   14210              :     case VOID_FTYPE_PINT_V8SI_UQI:
   14211              :     case VOID_FTYPE_PINT_V4SI_UQI:
   14212              :     case VOID_FTYPE_PINT64_V8DI_UQI:
   14213              :     case VOID_FTYPE_PINT64_V4DI_UQI:
   14214              :     case VOID_FTYPE_PINT64_V2DI_UQI:
   14215              :     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
   14216              :     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
   14217              :     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
   14218              :     case VOID_FTYPE_PFLOAT_V16SF_UHI:
   14219              :     case VOID_FTYPE_PFLOAT_V8SF_UQI:
   14220              :     case VOID_FTYPE_PFLOAT_V4SF_UQI:
   14221              :     case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
   14222              :     case VOID_FTYPE_PV32QI_V32HI_USI:
   14223              :     case VOID_FTYPE_PV16QI_V16HI_UHI:
   14224              :     case VOID_FTYPE_PUDI_V8HI_UQI:
   14225              :       nargs = 2;
   14226              :       klass = store;
   14227              :       /* Reserve memory operand for target.  */
   14228              :       memory = ARRAY_SIZE (xops);
   14229              :       break;
   14230         1243 :     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
   14231         1243 :     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
   14232         1243 :     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
   14233         1243 :     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
   14234         1243 :     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
   14235         1243 :     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
   14236         1243 :     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
   14237         1243 :     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
   14238         1243 :     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
   14239         1243 :     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
   14240         1243 :     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
   14241         1243 :     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
   14242         1243 :     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
   14243         1243 :     case V32HI_FTYPE_PCV32HI_V32HI_USI:
   14244         1243 :     case V32QI_FTYPE_PCV32QI_V32QI_USI:
   14245         1243 :     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
   14246         1243 :     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
   14247         1243 :     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
   14248         1243 :       switch (icode)
   14249              :         {
   14250              :         /* These builtins and instructions require the memory
   14251              :            to be properly aligned.  */
   14252              :         case CODE_FOR_avx512f_loadv16sf_mask:
   14253              :         case CODE_FOR_avx512f_loadv16si_mask:
   14254              :         case CODE_FOR_avx512f_loadv8df_mask:
   14255              :         case CODE_FOR_avx512f_loadv8di_mask:
   14256              :         case CODE_FOR_avx512vl_loadv8sf_mask:
   14257              :         case CODE_FOR_avx512vl_loadv8si_mask:
   14258              :         case CODE_FOR_avx512vl_loadv4df_mask:
   14259              :         case CODE_FOR_avx512vl_loadv4di_mask:
   14260              :         case CODE_FOR_avx512vl_loadv4sf_mask:
   14261              :         case CODE_FOR_avx512vl_loadv4si_mask:
   14262              :         case CODE_FOR_avx512vl_loadv2df_mask:
   14263              :         case CODE_FOR_avx512vl_loadv2di_mask:
   14264              :         case CODE_FOR_avx512bw_loadv64qi_mask:
   14265              :         case CODE_FOR_avx512vl_loadv32qi_mask:
   14266              :         case CODE_FOR_avx512vl_loadv16qi_mask:
   14267              :         case CODE_FOR_avx512bw_loadv32hi_mask:
   14268              :         case CODE_FOR_avx512vl_loadv16hi_mask:
   14269              :         case CODE_FOR_avx512vl_loadv8hi_mask:
   14270        11808 :           aligned_mem = true;
   14271              :           break;
   14272              :         default:
   14273              :           break;
   14274              :         }
   14275              :       /* FALLTHRU */
   14276              :     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
   14277              :     case V32QI_FTYPE_PCCHAR_V32QI_USI:
   14278              :     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
   14279              :     case V32HI_FTYPE_PCSHORT_V32HI_USI:
   14280              :     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
   14281              :     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
   14282              :     case V16SI_FTYPE_PCINT_V16SI_UHI:
   14283              :     case V8SI_FTYPE_PCINT_V8SI_UQI:
   14284              :     case V4SI_FTYPE_PCINT_V4SI_UQI:
   14285              :     case V8DI_FTYPE_PCINT64_V8DI_UQI:
   14286              :     case V4DI_FTYPE_PCINT64_V4DI_UQI:
   14287              :     case V2DI_FTYPE_PCINT64_V2DI_UQI:
   14288              :     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
   14289              :     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
   14290              :     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
   14291              :     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
   14292              :     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
   14293              :     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
   14294              :     case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
   14295              :       nargs = 3;
   14296              :       klass = load;
   14297              :       memory = 0;
   14298              :       break;
   14299          105 :     case INT_FTYPE_PINT_INT_INT_INT:
   14300          105 :     case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
   14301          105 :       nargs = 4;
   14302          105 :       klass = load;
   14303          105 :       memory = 0;
   14304          105 :       constant = 3;
   14305          105 :       break;
   14306            0 :     default:
   14307            0 :       gcc_unreachable ();
   14308              :     }
   14309              : 
   14310         8338 :   gcc_assert (nargs <= ARRAY_SIZE (xops));
   14311              : 
   14312        11808 :   if (klass == store)
   14313              :     {
   14314         1875 :       arg = CALL_EXPR_ARG (exp, 0);
   14315         1875 :       op = expand_normal (arg);
   14316         1875 :       gcc_assert (target == 0);
   14317         1875 :       if (memory)
   14318              :         {
   14319         1715 :           op = ix86_zero_extend_to_Pmode (op);
   14320         1715 :           target = gen_rtx_MEM (tmode, op);
   14321              :           /* target at this point has just BITS_PER_UNIT MEM_ALIGN
   14322              :              on it.  Try to improve it using get_pointer_alignment,
   14323              :              and if the special builtin is one that requires strict
   14324              :              mode alignment, also from it's GET_MODE_ALIGNMENT.
   14325              :              Failure to do so could lead to ix86_legitimate_combined_insn
   14326              :              rejecting all changes to such insns.  */
   14327         1715 :           unsigned int align = get_pointer_alignment (arg);
   14328         1715 :           if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
   14329          275 :             align = GET_MODE_ALIGNMENT (tmode);
   14330         3430 :           if (MEM_ALIGN (target) < align)
   14331          422 :             set_mem_align (target, align);
   14332              :         }
   14333              :       else
   14334          160 :         target = force_reg (tmode, op);
   14335              :       arg_adjust = 1;
   14336              :     }
   14337              :   else
   14338              :     {
   14339         9933 :       arg_adjust = 0;
   14340         9933 :       if (optimize
   14341         2918 :           || target == 0
   14342         2918 :           || !register_operand (target, tmode)
   14343        12840 :           || GET_MODE (target) != tmode)
   14344         7026 :         target = gen_reg_rtx (tmode);
   14345              :     }
   14346              : 
   14347        21193 :   for (i = 0; i < nargs; i++)
   14348              :     {
   14349         9385 :       machine_mode mode = insn_p->operand[i + 1].mode;
   14350              : 
   14351         9385 :       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
   14352         9385 :       op = ix86_expand_unsigned_small_int_cst_argument (arg);
   14353              : 
   14354         9385 :       if (i == memory)
   14355              :         {
   14356              :           /* This must be the memory operand.  */
   14357         2352 :           op = ix86_zero_extend_to_Pmode (op);
   14358         2352 :           op = gen_rtx_MEM (mode, op);
   14359              :           /* op at this point has just BITS_PER_UNIT MEM_ALIGN
   14360              :              on it.  Try to improve it using get_pointer_alignment,
   14361              :              and if the special builtin is one that requires strict
   14362              :              mode alignment, also from it's GET_MODE_ALIGNMENT.
   14363              :              Failure to do so could lead to ix86_legitimate_combined_insn
   14364              :              rejecting all changes to such insns.  */
   14365         2352 :           unsigned int align = get_pointer_alignment (arg);
   14366         2352 :           if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
   14367          299 :             align = GET_MODE_ALIGNMENT (mode);
   14368         4704 :           if (MEM_ALIGN (op) < align)
   14369          523 :             set_mem_align (op, align);
   14370              :         }
   14371         7033 :       else if (i == constant)
   14372              :         {
   14373              :           /* This must be the constant.  */
   14374          105 :           if (!insn_p->operand[nargs].predicate(op, SImode))
   14375              :             {
   14376            0 :               error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
   14377            0 :               return const0_rtx;
   14378              :             }
   14379              :         }
   14380              :       else
   14381              :         {
   14382              :           /* This must be register.  */
   14383         6928 :           if (VECTOR_MODE_P (mode))
   14384         3474 :             op = safe_vector_operand (op, mode);
   14385              : 
   14386         6928 :           op = fixup_modeless_constant (op, mode);
   14387              : 
   14388              :           /* NB: 3-operands load implied it's a mask load or v{p}expand*,
   14389              :              and that mask operand shoud be at the end.
   14390              :              Keep all-ones mask which would be simplified by the expander.  */
   14391         1770 :           if (nargs == 3 && i == 2 && klass == load
   14392         1770 :               && constm1_operand (op, mode)
   14393         7101 :               && insn_p->operand[i].predicate (op, mode))
   14394              :             ;
   14395         6928 :           else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   14396         6928 :             op = copy_to_mode_reg (mode, op);
   14397              :           else
   14398              :             {
   14399            0 :               op = copy_to_reg (op);
   14400            0 :               op = lowpart_subreg (mode, op, GET_MODE (op));
   14401              :             }
   14402              :         }
   14403              : 
   14404         9385 :       xops[i]= op;
   14405              :     }
   14406              : 
   14407        11808 :   switch (nargs)
   14408              :     {
   14409         7741 :     case 0:
   14410         7741 :       pat = GEN_FCN (icode) (target);
   14411         7741 :       break;
   14412          729 :     case 1:
   14413          729 :       pat = GEN_FCN (icode) (target, xops[0]);
   14414          729 :       break;
   14415         1463 :     case 2:
   14416         1463 :       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
   14417         1463 :       break;
   14418         1770 :     case 3:
   14419         1770 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
   14420         1770 :       break;
   14421          105 :     case 4:
   14422          105 :       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
   14423          105 :       break;
   14424              :     default:
   14425              :       gcc_unreachable ();
   14426              :     }
   14427              : 
   14428        11808 :   if (! pat)
   14429              :     return 0;
   14430              : 
   14431        11808 :   emit_insn (pat);
   14432        11808 :   return klass == store ? 0 : target;
   14433              : }
   14434              : 
   14435              : /* Return the integer constant in ARG.  Constrain it to be in the range
   14436              :    of the subparts of VEC_TYPE; issue an error if not.  */
   14437              : 
   14438              : static int
   14439          603 : get_element_number (tree vec_type, tree arg)
   14440              : {
   14441          603 :   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
   14442              : 
   14443          603 :   if (!tree_fits_uhwi_p (arg)
   14444          603 :       || (elt = tree_to_uhwi (arg), elt > max))
   14445              :     {
   14446            0 :       error ("selector must be an integer constant in the range "
   14447              :              "[0, %wi]", max);
   14448            0 :       return 0;
   14449              :     }
   14450              : 
   14451          603 :   return elt;
   14452              : }
   14453              : 
   14454              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14455              :    ix86_expand_vector_init.  We DO have language-level syntax for this, in
   14456              :    the form of  (type){ init-list }.  Except that since we can't place emms
   14457              :    instructions from inside the compiler, we can't allow the use of MMX
   14458              :    registers unless the user explicitly asks for it.  So we do *not* define
   14459              :    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   14460              :    we have builtins invoked by mmintrin.h that gives us license to emit
   14461              :    these sorts of instructions.  */
   14462              : 
   14463              : static rtx
   14464          229 : ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
   14465              : {
   14466          229 :   machine_mode tmode = TYPE_MODE (type);
   14467          229 :   machine_mode inner_mode = GET_MODE_INNER (tmode);
   14468          229 :   int i, n_elt = GET_MODE_NUNITS (tmode);
   14469          229 :   rtvec v = rtvec_alloc (n_elt);
   14470              : 
   14471          229 :   gcc_assert (VECTOR_MODE_P (tmode));
   14472          229 :   gcc_assert (call_expr_nargs (exp) == n_elt);
   14473              : 
   14474         1203 :   for (i = 0; i < n_elt; ++i)
   14475              :     {
   14476          974 :       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
   14477          974 :       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
   14478              :     }
   14479              : 
   14480          229 :   if (!target || !register_operand (target, tmode))
   14481            0 :     target = gen_reg_rtx (tmode);
   14482              : 
   14483          229 :   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
   14484          229 :   return target;
   14485              : }
   14486              : 
   14487              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14488              :    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   14489              :    had a language-level syntax for referencing vector elements.  */
   14490              : 
   14491              : static rtx
   14492          399 : ix86_expand_vec_ext_builtin (tree exp, rtx target)
   14493              : {
   14494          399 :   machine_mode tmode, mode0;
   14495          399 :   tree arg0, arg1;
   14496          399 :   int elt;
   14497          399 :   rtx op0;
   14498              : 
   14499          399 :   arg0 = CALL_EXPR_ARG (exp, 0);
   14500          399 :   arg1 = CALL_EXPR_ARG (exp, 1);
   14501              : 
   14502          399 :   op0 = expand_normal (arg0);
   14503          399 :   elt = get_element_number (TREE_TYPE (arg0), arg1);
   14504              : 
   14505          399 :   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
   14506          399 :   mode0 = TYPE_MODE (TREE_TYPE (arg0));
   14507          399 :   gcc_assert (VECTOR_MODE_P (mode0));
   14508              : 
   14509          399 :   op0 = force_reg (mode0, op0);
   14510              : 
   14511          399 :   if (optimize || !target || !register_operand (target, tmode))
   14512          320 :     target = gen_reg_rtx (tmode);
   14513              : 
   14514          399 :   ix86_expand_vector_extract (true, target, op0, elt);
   14515              : 
   14516          399 :   return target;
   14517              : }
   14518              : 
   14519              : /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   14520              :    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   14521              :    a language-level syntax for referencing vector elements.  */
   14522              : 
   14523              : static rtx
   14524          204 : ix86_expand_vec_set_builtin (tree exp)
   14525              : {
   14526          204 :   machine_mode tmode, mode1;
   14527          204 :   tree arg0, arg1, arg2;
   14528          204 :   int elt;
   14529          204 :   rtx op0, op1, target;
   14530              : 
   14531          204 :   arg0 = CALL_EXPR_ARG (exp, 0);
   14532          204 :   arg1 = CALL_EXPR_ARG (exp, 1);
   14533          204 :   arg2 = CALL_EXPR_ARG (exp, 2);
   14534              : 
   14535          204 :   tmode = TYPE_MODE (TREE_TYPE (arg0));
   14536          204 :   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
   14537          204 :   gcc_assert (VECTOR_MODE_P (tmode));
   14538              : 
   14539          204 :   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
   14540          204 :   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
   14541          204 :   elt = get_element_number (TREE_TYPE (arg0), arg2);
   14542              : 
   14543          204 :   if (GET_MODE (op1) != mode1)
   14544           82 :     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
   14545              : 
   14546          204 :   op0 = force_reg (tmode, op0);
   14547          204 :   op1 = force_reg (mode1, op1);
   14548              : 
   14549              :   /* OP0 is the source of these builtin functions and shouldn't be
   14550              :      modified.  Create a copy, use it and return it as target.  */
   14551          204 :   target = gen_reg_rtx (tmode);
   14552          204 :   emit_move_insn (target, op0);
   14553          204 :   ix86_expand_vector_set (true, target, op1, elt);
   14554              : 
   14555          204 :   return target;
   14556              : }
   14557              : 
/* Return true if the necessary isa options for this builtin exist,
   else false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);
   If PBISA/PBISA2 are non-NULL, they receive the (possibly adjusted)
   ISA requirement masks of the builtin.  */
bool
ix86_check_builtin_isa_match (unsigned int fcode,
                              HOST_WIDE_INT* pbisa,
                              HOST_WIDE_INT* pbisa2)
{
  /* Currently enabled ISAs.  */
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  /* ISAs this builtin requires.  */
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXIFMA
     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
       OPTION_MASK_ISA2_AVXNECONVERT
     OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
     OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT8
     OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT16
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */

/* If the builtin requires both alternative ISA sets (A1,A2) and (B1,B2),
   and at least one of the two alternatives is actually enabled, pretend
   both alternatives are enabled so the final subset check succeeds.  */
#define SHARE_BUILTIN(A1, A2, B1, B2) \
  if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
       && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
      && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
          || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
    { \
      tmp_isa |= (A1) | (B1); \
      tmp_isa2 |= (A2) | (B2); \
    }

  SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
                 OPTION_MASK_ISA2_AVXVNNI);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
                 OPTION_MASK_ISA2_AVXIFMA);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
                 OPTION_MASK_ISA2_AVXNECONVERT);
  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
                 OPTION_MASK_ISA2_VAES);
  SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT8, 0,
                 OPTION_MASK_ISA2_AVX10_2);
  SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT16, 0,
                 OPTION_MASK_ISA2_AVX10_2);
  isa = tmp_isa;
  isa2 = tmp_isa2;

  /* An MMX requirement is satisfiable via SSE2 when MMX registers are
     emulated with SSE (TARGET_MMX_WITH_SSE), except for builtins that
     need real MMX registers.  */
  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  if (pbisa)
    *pbisa = bisa;
  if (pbisa2)
    *pbisa2 = bisa2;

  /* Usable iff every required ISA bit is among the enabled ones.  */
  return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
   14633              : 
   14634              : /* Emit instructions to set the carry flag from ARG.  */
   14635              : 
   14636              : void
   14637        13074 : ix86_expand_carry (rtx arg)
   14638              : {
   14639        13074 :   if (!CONST_INT_P (arg) || arg == const0_rtx)
   14640              :     {
   14641        13068 :       arg = convert_to_mode (QImode, arg, 1);
   14642        13068 :       arg = copy_to_mode_reg (QImode, arg);
   14643        13068 :       emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
   14644              :     }
   14645              :   else
   14646            6 :     emit_insn (gen_x86_stc ());
   14647        13074 : }
   14648              : 
   14649              : /* Expand an expression EXP that calls a built-in function,
   14650              :    with result going to TARGET if that's convenient
   14651              :    (and in mode MODE if that's convenient).
   14652              :    SUBTARGET may be used as the target for computing one of EXP's operands.
   14653              :    IGNORE is nonzero if the value is to be ignored.  */
   14654              : 
   14655              : rtx
   14656       172368 : ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
   14657              :                      machine_mode mode, int ignore)
   14658              : {
   14659       172368 :   size_t i;
   14660       172368 :   enum insn_code icode, icode2;
   14661       172368 :   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   14662       172368 :   tree arg0, arg1, arg2, arg3, arg4;
   14663       172368 :   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
   14664       172368 :   machine_mode mode0, mode1, mode2, mode3, mode4;
   14665       172368 :   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
   14666       172368 :   HOST_WIDE_INT bisa, bisa2;
   14667              : 
   14668              :   /* For CPU builtins that can be folded, fold first and expand the fold.  */
   14669       172368 :   switch (fcode)
   14670              :     {
   14671          195 :     case IX86_BUILTIN_CPU_INIT:
   14672          195 :       {
   14673              :         /* Make it call __cpu_indicator_init in libgcc.  */
   14674          195 :         tree call_expr, fndecl, type;
   14675          195 :         type = build_function_type_list (integer_type_node, NULL_TREE);
   14676          195 :         fndecl = build_fn_decl ("__cpu_indicator_init", type);
   14677          195 :         call_expr = build_call_expr (fndecl, 0);
   14678          195 :         return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
   14679              :       }
   14680          584 :     case IX86_BUILTIN_CPU_IS:
   14681          584 :     case IX86_BUILTIN_CPU_SUPPORTS:
   14682          584 :       {
   14683          584 :         tree arg0 = CALL_EXPR_ARG (exp, 0);
   14684          584 :         tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
   14685          584 :         gcc_assert (fold_expr != NULL_TREE);
   14686          584 :         return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
   14687              :       }
   14688              :     }
   14689              : 
   14690       171589 :   if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
   14691              :     {
   14692           23 :       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
   14693           23 :       if (TARGET_ABI_X32)
   14694            0 :         bisa |= OPTION_MASK_ABI_X32;
   14695              :       else
   14696           23 :         bisa |= OPTION_MASK_ABI_64;
   14697           23 :       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
   14698              :                                        (enum fpmath_unit) 0,
   14699              :                                        (enum prefer_vector_width) 0,
   14700              :                                        PVW_NONE, false, add_abi_p);
   14701           23 :       if (!opts)
   14702            0 :         error ("%qE needs unknown isa option", fndecl);
   14703              :       else
   14704              :         {
   14705           23 :           gcc_assert (opts != NULL);
   14706           23 :           error ("%qE needs isa option %s", fndecl, opts);
   14707           23 :           free (opts);
   14708              :         }
   14709           23 :       return expand_call (exp, target, ignore);
   14710              :     }
   14711              : 
   14712       171566 :   switch (fcode)
   14713              :     {
   14714           35 :     case IX86_BUILTIN_MASKMOVQ:
   14715           35 :     case IX86_BUILTIN_MASKMOVDQU:
   14716           34 :       icode = (fcode == IX86_BUILTIN_MASKMOVQ
   14717           35 :                ? CODE_FOR_mmx_maskmovq
   14718              :                : CODE_FOR_sse2_maskmovdqu);
   14719              :       /* Note the arg order is different from the operand order.  */
   14720           35 :       arg1 = CALL_EXPR_ARG (exp, 0);
   14721           35 :       arg2 = CALL_EXPR_ARG (exp, 1);
   14722           35 :       arg0 = CALL_EXPR_ARG (exp, 2);
   14723           35 :       op0 = expand_normal (arg0);
   14724           35 :       op1 = expand_normal (arg1);
   14725           35 :       op2 = expand_normal (arg2);
   14726           35 :       mode0 = insn_data[icode].operand[0].mode;
   14727           35 :       mode1 = insn_data[icode].operand[1].mode;
   14728           35 :       mode2 = insn_data[icode].operand[2].mode;
   14729              : 
   14730           35 :       op0 = ix86_zero_extend_to_Pmode (op0);
   14731           35 :       op0 = gen_rtx_MEM (mode1, op0);
   14732              : 
   14733           35 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   14734            0 :         op0 = copy_to_mode_reg (mode0, op0);
   14735           35 :       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   14736            2 :         op1 = copy_to_mode_reg (mode1, op1);
   14737           35 :       if (!insn_data[icode].operand[2].predicate (op2, mode2))
   14738            2 :         op2 = copy_to_mode_reg (mode2, op2);
   14739           35 :       pat = GEN_FCN (icode) (op0, op1, op2);
   14740           35 :       if (! pat)
   14741        56617 :         return 0;
   14742           35 :       emit_insn (pat);
   14743           35 :       return 0;
   14744              : 
   14745        22008 :     case IX86_BUILTIN_LDMXCSR:
   14746        22008 :       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
   14747        22008 :       target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
   14748        22008 :       emit_move_insn (target, op0);
   14749        22008 :       emit_insn (gen_sse_ldmxcsr (target));
   14750        22008 :       return 0;
   14751              : 
   14752        14785 :     case IX86_BUILTIN_STMXCSR:
   14753        14785 :       target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
   14754        14785 :       emit_insn (gen_sse_stmxcsr (target));
   14755        14785 :       return copy_to_mode_reg (SImode, target);
   14756              : 
   14757           11 :     case IX86_BUILTIN_CLFLUSH:
   14758           11 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14759           11 :         op0 = expand_normal (arg0);
   14760           11 :         icode = CODE_FOR_sse2_clflush;
   14761           11 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14762            5 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14763              : 
   14764           11 :         emit_insn (gen_sse2_clflush (op0));
   14765           11 :         return 0;
   14766              : 
   14767           19 :     case IX86_BUILTIN_CLWB:
   14768           19 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14769           19 :         op0 = expand_normal (arg0);
   14770           19 :         icode = CODE_FOR_clwb;
   14771           19 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14772            9 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14773              : 
   14774           19 :         emit_insn (gen_clwb (op0));
   14775           19 :         return 0;
   14776              : 
   14777           19 :     case IX86_BUILTIN_CLFLUSHOPT:
   14778           19 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14779           19 :         op0 = expand_normal (arg0);
   14780           19 :         icode = CODE_FOR_clflushopt;
   14781           19 :         if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14782            9 :           op0 = ix86_zero_extend_to_Pmode (op0);
   14783              : 
   14784           19 :         emit_insn (gen_clflushopt (op0));
   14785           19 :         return 0;
   14786              : 
   14787           47 :     case IX86_BUILTIN_MONITOR:
   14788           47 :     case IX86_BUILTIN_MONITORX:
   14789           47 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14790           47 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14791           47 :       arg2 = CALL_EXPR_ARG (exp, 2);
   14792           47 :       op0 = expand_normal (arg0);
   14793           47 :       op1 = expand_normal (arg1);
   14794           47 :       op2 = expand_normal (arg2);
   14795           47 :       if (!REG_P (op0))
   14796           19 :         op0 = ix86_zero_extend_to_Pmode (op0);
   14797           47 :       if (!REG_P (op1))
   14798           22 :         op1 = copy_to_mode_reg (SImode, op1);
   14799           47 :       if (!REG_P (op2))
   14800           25 :         op2 = copy_to_mode_reg (SImode, op2);
   14801              : 
   14802           47 :       emit_insn (fcode == IX86_BUILTIN_MONITOR
   14803           26 :                  ? gen_sse3_monitor (Pmode, op0, op1, op2)
   14804           21 :                  : gen_monitorx (Pmode, op0, op1, op2));
   14805           47 :       return 0;
   14806              : 
   14807           25 :     case IX86_BUILTIN_MWAIT:
   14808           25 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14809           25 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14810           25 :       op0 = expand_normal (arg0);
   14811           25 :       op1 = expand_normal (arg1);
   14812           25 :       if (!REG_P (op0))
   14813           13 :         op0 = copy_to_mode_reg (SImode, op0);
   14814           25 :       if (!REG_P (op1))
   14815           11 :         op1 = copy_to_mode_reg (SImode, op1);
   14816           25 :       emit_insn (gen_sse3_mwait (op0, op1));
   14817           25 :       return 0;
   14818              : 
   14819           21 :     case IX86_BUILTIN_MWAITX:
   14820           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14821           21 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14822           21 :       arg2 = CALL_EXPR_ARG (exp, 2);
   14823           21 :       op0 = expand_normal (arg0);
   14824           21 :       op1 = expand_normal (arg1);
   14825           21 :       op2 = expand_normal (arg2);
   14826           21 :       if (!REG_P (op0))
   14827           11 :         op0 = copy_to_mode_reg (SImode, op0);
   14828           21 :       if (!REG_P (op1))
   14829           10 :         op1 = copy_to_mode_reg (SImode, op1);
   14830           21 :       if (!REG_P (op2))
   14831           11 :         op2 = copy_to_mode_reg (SImode, op2);
   14832           21 :       emit_insn (gen_mwaitx (op0, op1, op2));
   14833           21 :       return 0;
   14834              : 
   14835           21 :     case IX86_BUILTIN_UMONITOR:
   14836           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14837           21 :       op0 = expand_normal (arg0);
   14838              : 
   14839           21 :       op0 = ix86_zero_extend_to_Pmode (op0);
   14840           21 :       emit_insn (gen_umonitor (Pmode, op0));
   14841           21 :       return 0;
   14842              : 
   14843           42 :     case IX86_BUILTIN_UMWAIT:
   14844           42 :     case IX86_BUILTIN_TPAUSE:
   14845           42 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14846           42 :       arg1 = CALL_EXPR_ARG (exp, 1);
   14847           42 :       op0 = expand_normal (arg0);
   14848           42 :       op1 = expand_normal (arg1);
   14849              : 
   14850           42 :       if (!REG_P (op0))
   14851           20 :         op0 = copy_to_mode_reg (SImode, op0);
   14852              : 
   14853           42 :       op1 = force_reg (DImode, op1);
   14854              : 
   14855           42 :       if (TARGET_64BIT)
   14856              :         {
   14857           42 :           op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   14858              :                                      NULL, 1, OPTAB_DIRECT);
   14859           42 :           switch (fcode)
   14860              :             {
   14861              :             case IX86_BUILTIN_UMWAIT:
   14862              :               icode = CODE_FOR_umwait_rex64;
   14863              :               break;
   14864           21 :             case IX86_BUILTIN_TPAUSE:
   14865           21 :               icode = CODE_FOR_tpause_rex64;
   14866           21 :               break;
   14867            0 :             default:
   14868            0 :               gcc_unreachable ();
   14869              :             }
   14870              : 
   14871           42 :           op2 = gen_lowpart (SImode, op2);
   14872           42 :           op1 = gen_lowpart (SImode, op1);
   14873           42 :           pat = GEN_FCN (icode) (op0, op1, op2);
   14874              :         }
   14875              :       else
   14876              :         {
   14877            0 :           switch (fcode)
   14878              :             {
   14879              :             case IX86_BUILTIN_UMWAIT:
   14880              :               icode = CODE_FOR_umwait;
   14881              :               break;
   14882            0 :             case IX86_BUILTIN_TPAUSE:
   14883            0 :               icode = CODE_FOR_tpause;
   14884            0 :               break;
   14885            0 :             default:
   14886            0 :               gcc_unreachable ();
   14887              :             }
   14888            0 :           pat = GEN_FCN (icode) (op0, op1);
   14889              :         }
   14890              : 
   14891           42 :       if (!pat)
   14892              :         return 0;
   14893              : 
   14894           42 :       emit_insn (pat);
   14895              : 
   14896           42 :       if (target == 0
   14897           42 :           || !register_operand (target, QImode))
   14898            0 :         target = gen_reg_rtx (QImode);
   14899              : 
   14900           42 :       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   14901              :                         const0_rtx);
   14902           42 :       emit_insn (gen_rtx_SET (target, pat));
   14903              : 
   14904           42 :       return target;
   14905              : 
   14906           20 :     case IX86_BUILTIN_TESTUI:
   14907           20 :       emit_insn (gen_testui ());
   14908              : 
   14909           20 :       if (target == 0
   14910           20 :           || !register_operand (target, QImode))
   14911            0 :         target = gen_reg_rtx (QImode);
   14912              : 
   14913           20 :       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   14914              :                          const0_rtx);
   14915           20 :       emit_insn (gen_rtx_SET (target, pat));
   14916              : 
   14917           20 :       return target;
   14918              : 
   14919           19 :     case IX86_BUILTIN_CLZERO:
   14920           19 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14921           19 :       op0 = expand_normal (arg0);
   14922           19 :       if (!REG_P (op0))
   14923            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   14924           19 :       emit_insn (gen_clzero (Pmode, op0));
   14925           19 :       return 0;
   14926              : 
   14927           19 :     case IX86_BUILTIN_CLDEMOTE:
   14928           19 :       arg0 = CALL_EXPR_ARG (exp, 0);
   14929           19 :       op0 = expand_normal (arg0);
   14930           19 :       icode = CODE_FOR_cldemote;
   14931           19 :       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   14932            9 :         op0 = ix86_zero_extend_to_Pmode (op0);
   14933              : 
   14934           19 :       emit_insn (gen_cldemote (op0));
   14935           19 :       return 0;
   14936              : 
   14937           11 :     case IX86_BUILTIN_LOADIWKEY:
   14938           11 :       {
   14939           11 :         arg0 = CALL_EXPR_ARG (exp, 0);
   14940           11 :         arg1 = CALL_EXPR_ARG (exp, 1);
   14941           11 :         arg2 = CALL_EXPR_ARG (exp, 2);
   14942           11 :         arg3 = CALL_EXPR_ARG (exp, 3);
   14943              : 
   14944           11 :         op0 = expand_normal (arg0);
   14945           11 :         op1 = expand_normal (arg1);
   14946           11 :         op2 = expand_normal (arg2);
   14947           11 :         op3 = expand_normal (arg3);
   14948              : 
   14949           11 :         if (!REG_P (op0))
   14950            5 :           op0 = copy_to_mode_reg (V2DImode, op0);
   14951           11 :         if (!REG_P (op1))
   14952            5 :           op1 = copy_to_mode_reg (V2DImode, op1);
   14953           11 :         if (!REG_P (op2))
   14954            5 :           op2 = copy_to_mode_reg (V2DImode, op2);
   14955           11 :         if (!REG_P (op3))
   14956            5 :           op3 = copy_to_mode_reg (SImode, op3);
   14957              : 
   14958           11 :         emit_insn (gen_loadiwkey (op0, op1, op2, op3));
   14959              : 
   14960           11 :         return 0;
   14961              :       }
   14962              : 
   14963           12 :     case IX86_BUILTIN_AESDEC128KLU8:
   14964           12 :       icode = CODE_FOR_aesdec128klu8;
   14965           12 :       goto aesdecenc_expand;
   14966              : 
   14967           12 :     case IX86_BUILTIN_AESDEC256KLU8:
   14968           12 :       icode = CODE_FOR_aesdec256klu8;
   14969           12 :       goto aesdecenc_expand;
   14970              : 
   14971           12 :     case IX86_BUILTIN_AESENC128KLU8:
   14972           12 :       icode = CODE_FOR_aesenc128klu8;
   14973           12 :       goto aesdecenc_expand;
   14974              : 
   14975              :     case IX86_BUILTIN_AESENC256KLU8:
   14976              :       icode = CODE_FOR_aesenc256klu8;
   14977              : 
   14978           48 :     aesdecenc_expand:
   14979              : 
   14980           48 :       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
   14981           48 :       arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
   14982           48 :       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   14983              : 
   14984           48 :       op0 = expand_normal (arg0);
   14985           48 :       op1 = expand_normal (arg1);
   14986           48 :       op2 = expand_normal (arg2);
   14987              : 
   14988           48 :       if (!address_operand (op0, V2DImode))
   14989              :         {
   14990           16 :           op0 = convert_memory_address (Pmode, op0);
   14991           16 :           op0 = copy_addr_to_reg (op0);
   14992              :         }
   14993           48 :       op0 = gen_rtx_MEM (V2DImode, op0);
   14994              : 
   14995           48 :       if (!REG_P (op1))
   14996           20 :         op1 = copy_to_mode_reg (V2DImode, op1);
   14997              : 
   14998           48 :       if (!address_operand (op2, VOIDmode))
   14999              :         {
   15000           16 :           op2 = convert_memory_address (Pmode, op2);
   15001           16 :           op2 = copy_addr_to_reg (op2);
   15002              :         }
   15003           48 :       op2 = gen_rtx_MEM (BLKmode, op2);
   15004              : 
   15005           48 :       emit_insn (GEN_FCN (icode) (op1, op1, op2));
   15006              : 
   15007           48 :       if (target == 0)
   15008            4 :         target = gen_reg_rtx (QImode);
   15009              : 
   15010              :       /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
   15011              :          error occurs. Then the output should be cleared for safety. */
   15012           48 :       rtx_code_label *ok_label;
   15013           48 :       rtx tmp;
   15014              : 
   15015           48 :       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   15016           48 :       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   15017           48 :       ok_label = gen_label_rtx ();
   15018           48 :       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   15019              :                                true, ok_label);
   15020              :       /* Usually the runtime error seldom occur, so predict OK path as
   15021              :          hotspot to optimize it as fallthrough block. */
   15022           48 :       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   15023              : 
   15024           48 :       emit_insn (gen_rtx_SET (op1, const0_rtx));
   15025              : 
   15026           48 :       emit_label (ok_label);
   15027           48 :       emit_insn (gen_rtx_SET (target, pat));
   15028           48 :       emit_insn (gen_rtx_SET (op0, op1));
   15029              : 
   15030           48 :       return target;
   15031              : 
   15032           11 :     case IX86_BUILTIN_AESDECWIDE128KLU8:
   15033           11 :       icode = CODE_FOR_aesdecwide128klu8;
   15034           11 :       goto wideaesdecenc_expand;
   15035              : 
   15036           11 :     case IX86_BUILTIN_AESDECWIDE256KLU8:
   15037           11 :       icode = CODE_FOR_aesdecwide256klu8;
   15038           11 :       goto wideaesdecenc_expand;
   15039              : 
   15040           11 :     case IX86_BUILTIN_AESENCWIDE128KLU8:
   15041           11 :       icode = CODE_FOR_aesencwide128klu8;
   15042           11 :       goto wideaesdecenc_expand;
   15043              : 
   15044              :     case IX86_BUILTIN_AESENCWIDE256KLU8:
   15045              :       icode = CODE_FOR_aesencwide256klu8;
   15046              : 
   15047           44 :     wideaesdecenc_expand:
   15048              : 
   15049           44 :       rtx xmm_regs[8];
   15050           44 :       rtx op;
   15051              : 
   15052           44 :       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
   15053           44 :       arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
   15054           44 :       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   15055              : 
   15056           44 :       op0 = expand_normal (arg0);
   15057           44 :       op1 = expand_normal (arg1);
   15058           44 :       op2 = expand_normal (arg2);
   15059              : 
   15060           44 :       if (GET_MODE (op1) != Pmode)
   15061            0 :         op1 = convert_to_mode (Pmode, op1, 1);
   15062              : 
   15063           44 :       if (!address_operand (op2, VOIDmode))
   15064              :         {
   15065           16 :           op2 = convert_memory_address (Pmode, op2);
   15066           16 :           op2 = copy_addr_to_reg (op2);
   15067              :         }
   15068           44 :       op2 = gen_rtx_MEM (BLKmode, op2);
   15069              : 
   15070          440 :       for (i = 0; i < 8; i++)
   15071              :         {
   15072          352 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15073              : 
   15074          352 :           op = gen_rtx_MEM (V2DImode,
   15075          352 :                             plus_constant (Pmode, op1, (i * 16)));
   15076              : 
   15077          352 :           emit_move_insn (xmm_regs[i], op);
   15078              :         }
   15079              : 
   15080           44 :       emit_insn (GEN_FCN (icode) (op2));
   15081              : 
   15082           44 :       if (target == 0)
   15083            0 :         target = gen_reg_rtx (QImode);
   15084              : 
   15085           44 :       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   15086           44 :       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   15087           44 :       ok_label = gen_label_rtx ();
   15088           44 :       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   15089              :                                true, ok_label);
   15090           44 :       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   15091              : 
   15092          440 :       for (i = 0; i < 8; i++)
   15093          352 :         emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
   15094              : 
   15095           44 :       emit_label (ok_label);
   15096           44 :       emit_insn (gen_rtx_SET (target, pat));
   15097              : 
   15098           44 :       if (GET_MODE (op0) != Pmode)
   15099            0 :         op0 = convert_to_mode (Pmode, op0, 1);
   15100              : 
   15101          396 :       for (i = 0; i < 8; i++)
   15102              :         {
   15103          352 :           op = gen_rtx_MEM (V2DImode,
   15104          352 :                             plus_constant (Pmode, op0, (i * 16)));
   15105          352 :           emit_move_insn (op, xmm_regs[i]);
   15106              :         }
   15107              : 
   15108              :       return target;
   15109              : 
   15110           13 :     case IX86_BUILTIN_ENCODEKEY128U32:
   15111           13 :       {
   15112           13 :         rtx op, xmm_regs[7];
   15113              : 
   15114           13 :         arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   15115           13 :         arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
   15116           13 :         arg2 = CALL_EXPR_ARG (exp, 2); // void *h
   15117              : 
   15118           13 :         op0 = expand_normal (arg0);
   15119           13 :         op1 = expand_normal (arg1);
   15120           13 :         op2 = expand_normal (arg2);
   15121              : 
   15122           13 :         if (!REG_P (op0))
   15123            7 :           op0 = copy_to_mode_reg (SImode, op0);
   15124              : 
   15125           13 :         if (GET_MODE (op2) != Pmode)
   15126            1 :           op2 = convert_to_mode (Pmode, op2, 1);
   15127              : 
   15128           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   15129           13 :         emit_move_insn (op, op1);
   15130              : 
   15131           65 :         for (i = 0; i < 3; i++)
   15132           39 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15133              : 
   15134           13 :         if (target == 0 || !register_operand (target, SImode))
   15135            2 :           target = gen_reg_rtx (SImode);
   15136              : 
   15137           13 :         emit_insn (gen_encodekey128u32 (target, op0));
   15138              : 
   15139           65 :         for (i = 0; i < 3; i++)
   15140              :           {
   15141           39 :             op = gen_rtx_MEM (V2DImode,
   15142           39 :                               plus_constant (Pmode, op2, (i * 16)));
   15143           39 :             emit_move_insn (op, xmm_regs[i]);
   15144              :           }
   15145              : 
   15146           13 :         return target;
   15147              :       }
   15148           13 :     case IX86_BUILTIN_ENCODEKEY256U32:
   15149           13 :       {
   15150           13 :         rtx op, xmm_regs[7];
   15151              : 
   15152           13 :         arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   15153           13 :         arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
   15154           13 :         arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
   15155           13 :         arg3 = CALL_EXPR_ARG (exp, 3); // void *h
   15156              : 
   15157           13 :         op0 = expand_normal (arg0);
   15158           13 :         op1 = expand_normal (arg1);
   15159           13 :         op2 = expand_normal (arg2);
   15160           13 :         op3 = expand_normal (arg3);
   15161              : 
   15162           13 :         if (!REG_P (op0))
   15163            7 :           op0 = copy_to_mode_reg (SImode, op0);
   15164              : 
   15165           13 :         if (GET_MODE (op3) != Pmode)
   15166            1 :           op3 = convert_to_mode (Pmode, op3, 1);
   15167              : 
   15168              :         /* Force to use xmm0, xmm1 for keylow, keyhi*/
   15169           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   15170           13 :         emit_move_insn (op, op1);
   15171           13 :         op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
   15172           13 :         emit_move_insn (op, op2);
   15173              : 
   15174           78 :         for (i = 0; i < 4; i++)
   15175           52 :           xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   15176              : 
   15177           13 :         if (target == 0 || !register_operand (target, SImode))
   15178            2 :           target = gen_reg_rtx (SImode);
   15179              : 
   15180           13 :         emit_insn (gen_encodekey256u32 (target, op0));
   15181              : 
   15182           78 :         for (i = 0; i < 4; i++)
   15183              :           {
   15184           52 :             op = gen_rtx_MEM (V2DImode,
   15185           52 :                               plus_constant (Pmode, op3, (i * 16)));
   15186           52 :             emit_move_insn (op, xmm_regs[i]);
   15187              :           }
   15188              : 
   15189           13 :         return target;
   15190              :       }
   15191              : 
   15192           48 :     case IX86_BUILTIN_PREFETCH:
   15193           48 :       {
   15194           48 :         arg0 = CALL_EXPR_ARG (exp, 0); // const void *
   15195           48 :         arg1 = CALL_EXPR_ARG (exp, 1); // const int
   15196           48 :         arg2 = CALL_EXPR_ARG (exp, 2); // const int
   15197           48 :         arg3 = CALL_EXPR_ARG (exp, 3); // const int
   15198              : 
   15199           48 :         op0 = expand_normal (arg0);
   15200           48 :         op1 = expand_normal (arg1);
   15201           48 :         op2 = expand_normal (arg2);
   15202           48 :         op3 = expand_normal (arg3);
   15203              : 
   15204           48 :         if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
   15205              :           {
   15206            0 :             error ("second, third and fourth argument must be a const");
   15207            0 :             return const0_rtx;
   15208              :           }
   15209              : 
   15210           48 :         if (!IN_RANGE (INTVAL (op1), 0, 2))
   15211              :           {
   15212            1 :             warning (0, "invalid second argument to"
   15213              :                      " %<__builtin_ia32_prefetch%>; using zero");
   15214            1 :             op1 = const0_rtx;
   15215              :           }
   15216              : 
   15217           48 :         if (INTVAL (op3) == 1)
   15218              :           {
   15219            4 :             if (!IN_RANGE (INTVAL (op2), 2, 3))
   15220              :               {
   15221            1 :                 error ("invalid third argument");
   15222            1 :                 return const0_rtx;
   15223              :               }
   15224              : 
   15225            3 :             if (TARGET_64BIT && TARGET_PREFETCHI
   15226            6 :                 && local_func_symbolic_operand (op0, GET_MODE (op0)))
   15227            2 :               emit_insn (gen_prefetchi (op0, op2));
   15228              :             else
   15229              :               {
   15230            1 :                 warning (0, "instruction prefetch applies when in 64-bit mode"
   15231              :                             " with RIP-relative addressing and"
   15232              :                             " option %<-mprefetchi%>;"
   15233              :                             " they stay NOPs otherwise");
   15234            1 :                 emit_insn (gen_nop ());
   15235              :               }
   15236              :           }
   15237              :         else
   15238              :           {
   15239           44 :             if (INTVAL (op3) != 0)
   15240            1 :               warning (0, "invalid forth argument to"
   15241              :                           " %<__builtin_ia32_prefetch%>; using zero");
   15242              : 
   15243           44 :             if (!address_operand (op0, VOIDmode))
   15244              :               {
   15245           10 :                 op0 = convert_memory_address (Pmode, op0);
   15246           10 :                 op0 = copy_addr_to_reg (op0);
   15247              :               }
   15248              : 
   15249           44 :             if (!IN_RANGE (INTVAL (op2), 0, 3))
   15250              :               {
   15251            1 :                 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
   15252            1 :                 op2 = const0_rtx;
   15253              :               }
   15254              : 
   15255           44 :             if (TARGET_3DNOW
   15256           26 :                 || TARGET_PREFETCH_SSE
   15257            0 :                 || TARGET_PRFCHW
   15258            0 :                 || TARGET_MOVRS)
   15259           44 :               emit_insn (gen_prefetch (op0, op1, op2));
   15260            0 :             else if (!MEM_P (op0) && side_effects_p (op0))
   15261              :               /* Don't do anything with direct references to volatile memory,
   15262              :                  but generate code to handle other side effects.  */
   15263            0 :               emit_insn (op0);
   15264              :           }
   15265              : 
   15266              :         return 0;
   15267              :       }
   15268              : 
    /* Expand __builtin_ia32_prefetchi: instruction prefetch hint.
       Emits a real PREFETCHI only for a local function symbol in 64-bit
       mode; otherwise warns and emits a NOP.  */
    case IX86_BUILTIN_PREFETCHI:
      {
        arg0 = CALL_EXPR_ARG (exp, 0); // const void *
        arg1 = CALL_EXPR_ARG (exp, 1); // const int

        op0 = expand_normal (arg0);
        op1 = expand_normal (arg1);

        /* The hint argument must fold to a compile-time constant.  */
        if (!CONST_INT_P (op1))
          {
            error ("second argument must be a const");
            return const0_rtx;
          }

        /* GOT/PLT_PIC should not be available for instruction prefetch.
           It must be real instruction address.  */
        if (TARGET_64BIT
            && local_func_symbolic_operand (op0, GET_MODE (op0)))
          emit_insn (gen_prefetchi (op0, op1));
        else
          {
            /* Ignore the hint.  */
            warning (0, "instruction prefetch applies when in 64-bit mode"
                        " with RIP-relative addressing and"
                        " option %<-mprefetchi%>;"
                        " they stay NOPs otherwise");
            emit_insn (gen_nop ());
          }

        return 0;
      }
   15300              : 
    /* Expand __builtin_ia32_urdmsr / __builtin_ia32_uwrmsr: user-mode
       read/write of a model-specific register.  */
    case IX86_BUILTIN_URDMSR:
    case IX86_BUILTIN_UWRMSR:
      {
        arg0 = CALL_EXPR_ARG (exp, 0);
        op0 = expand_normal (arg0);

        /* A constant MSR number that fits in 32 bits may stay an
           immediate; anything else must live in a DImode register.  */
        if (CONST_INT_P (op0))
          {
            unsigned HOST_WIDE_INT val = UINTVAL (op0);
            if (val > 0xffffffff)
              op0 = force_reg (DImode, op0);
          }
        else
          op0 = force_reg (DImode, op0);

        if (fcode == IX86_BUILTIN_UWRMSR)
          {
            /* UWRMSR writes the second argument to the MSR and returns
               nothing, so clear TARGET.  */
            arg1 = CALL_EXPR_ARG (exp, 1);
            op1 = expand_normal (arg1);
            op1 = force_reg (DImode, op1);
            icode = CODE_FOR_uwrmsr;
            target = 0;
          }
        else
          {
            /* URDMSR returns the MSR contents in a DImode register;
               operand 0 is the destination, operand 1 the MSR number.  */
            if (target == 0 || !register_operand (target, DImode))
              target = gen_reg_rtx (DImode);
            icode = CODE_FOR_urdmsr;
            op1 = op0;
            op0 = target;
          }
        emit_insn (GEN_FCN (icode) (op0, op1));
        return target;
      }
   15335              : 
    /* Vector init/extract/set builtins are handed off to dedicated
       expander helpers.  */
    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    /* __builtin_nanq / __builtin_nansq expand as ordinary library
       calls.  */
    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
   15362              : 
    /* Expand __builtin_ia32_rdpid: read the processor ID into an SImode
       result.  */
    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
        {
          /* The 64-bit pattern writes a full word register; truncate
             (zero-extended) to the SImode value the builtin returns.  */
          insn = gen_rdpid_rex64 (op0);
          op0 = convert_to_mode (SImode, op0, 1);
        }
      else
        insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
          || !register_operand (target, SImode))
        target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
   15383              : 
    /* Expand the AVX512 VP2INTERSECT builtins.  Arguments are two mask
       output pointers followed by the two source vectors; the insn
       produces a mask pair whose low/high parts are stored through the
       two pointers.  */
    case IX86_BUILTIN_2INTERSECTD512:
    case IX86_BUILTIN_2INTERSECTQ512:
    case IX86_BUILTIN_2INTERSECTD256:
    case IX86_BUILTIN_2INTERSECTQ256:
    case IX86_BUILTIN_2INTERSECTD128:
    case IX86_BUILTIN_2INTERSECTQ128:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      /* Force the two destination addresses into address form.  */
      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      if (!address_operand (op1, VOIDmode))
        {
          op1 = convert_memory_address (Pmode, op1);
          op1 = copy_addr_to_reg (op1);
        }

      /* Pick the insn and the mask-pair mode: a 16-element compare
         yields an HImode mask pair (P2HImode), all others QImode pairs
         (P2QImode).  */
      switch (fcode)
        {
        case IX86_BUILTIN_2INTERSECTD512:
          mode4 = P2HImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
          break;
        case IX86_BUILTIN_2INTERSECTQ512:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
          break;
        case IX86_BUILTIN_2INTERSECTD256:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
          break;
        case IX86_BUILTIN_2INTERSECTQ256:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
          break;
        case IX86_BUILTIN_2INTERSECTD128:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
          break;
        case IX86_BUILTIN_2INTERSECTQ128:
          mode4 = P2QImode;
          icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
          break;
        default:
          gcc_unreachable ();
        }

      /* Legitimize the vector source operands against the insn's
         predicates.  */
      mode2 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[2].mode;
      if (!insn_data[icode].operand[1].predicate (op2, mode2))
        op2 = copy_to_mode_reg (mode2, op2);
      if (!insn_data[icode].operand[2].predicate (op3, mode3))
        op3 = copy_to_mode_reg (mode3, op3);

      /* Run the intersect, then split the mask pair: low part to *op0,
         high part to *op1.  */
      op4 = gen_reg_rtx (mode4);
      emit_insn (GEN_FCN (icode) (op4, op2, op3));
      mode0 = mode4 == P2HImode ? HImode : QImode;
      emit_move_insn (gen_rtx_MEM (mode0, op0),
                      gen_lowpart (mode0, op4));
      emit_move_insn (gen_rtx_MEM (mode0, op1),
                      gen_highpart (mode0, op4));

      return 0;
   15456              : 
    /* Expand the EDX:EAX-returning builtins RDPMC, RDTSC, RDTSCP and
       XGETBV.  op0/op1 receive the low/high halves; on 64-bit targets
       they are recombined into one DImode result below.  */
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:

      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
        {
          /* RDPMC selects the counter via an SImode register operand.  */
          arg0 = CALL_EXPR_ARG (exp, 0);
          op2 = expand_normal (arg0);
          if (!register_operand (op2, SImode))
            op2 = copy_to_mode_reg (SImode, op2);

          insn = (TARGET_64BIT
                  ? gen_rdpmc_rex64 (op0, op1, op2)
                  : gen_rdpmc (op0, op2));
          emit_insn (insn);
        }
      else if (fcode == IX86_BUILTIN_XGETBV)
        {
          /* XGETBV selects the extended control register likewise.  */
          arg0 = CALL_EXPR_ARG (exp, 0);
          op2 = expand_normal (arg0);
          if (!register_operand (op2, SImode))
            op2 = copy_to_mode_reg (SImode, op2);

          insn = (TARGET_64BIT
                  ? gen_xgetbv_rex64 (op0, op1, op2)
                  : gen_xgetbv (op0, op2));
          emit_insn (insn);
        }
      else if (fcode == IX86_BUILTIN_RDTSC)
        {
          insn = (TARGET_64BIT
                  ? gen_rdtsc_rex64 (op0, op1)
                  : gen_rdtsc (op0));
          emit_insn (insn);
        }
      else
        {
          /* RDTSCP additionally yields the IA32_TSC_AUX value in op2;
             store it through the pointer argument.  */
          op2 = gen_reg_rtx (SImode);

          insn = (TARGET_64BIT
                  ? gen_rdtscp_rex64 (op0, op1, op2)
                  : gen_rdtscp (op0, op2));
          emit_insn (insn);

          arg0 = CALL_EXPR_ARG (exp, 0);
          op4 = expand_normal (arg0);
          if (!address_operand (op4, VOIDmode))
            {
              op4 = convert_memory_address (Pmode, op4);
              op4 = copy_addr_to_reg (op4);
            }
          emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
        }

      if (target == 0
          || !register_operand (target, DImode))
        target = gen_reg_rtx (DImode);

      /* On 64-bit targets combine the two halves: (op1 << 32) | op0.  */
      if (TARGET_64BIT)
        {
          op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
                                     op1, 1, OPTAB_DIRECT);
          op0 = expand_simple_binop (DImode, IOR, op0, op1,
                                     op0, 1, OPTAB_DIRECT);
        }

      emit_move_insn (target, op0);
      return target;
   15529              : 
    /* Expand ENQCMD, ENQCMDS and MOVDIR64B: all take a destination
       pointer and a 64-byte (XImode) source memory operand.  ENQCMD[S]
       additionally returns the ZF result as 0/1.  */
    case IX86_BUILTIN_ENQCMD:
    case IX86_BUILTIN_ENQCMDS:
    case IX86_BUILTIN_MOVDIR64B:

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      op0 = ix86_zero_extend_to_Pmode (op0);
      if (!address_operand (op1, VOIDmode))
      {
        op1 = convert_memory_address (Pmode, op1);
        op1 = copy_addr_to_reg (op1);
      }
      /* The source is always a 64-byte memory operand.  */
      op1 = gen_rtx_MEM (XImode, op1);

      if (fcode == IX86_BUILTIN_MOVDIR64B)
        {
          /* MOVDIR64B has no result.  */
          emit_insn (gen_movdir64b (Pmode, op0, op1));
          return 0;
        }
      else
        {
          if (target == 0
              || !register_operand (target, SImode))
            target = gen_reg_rtx (SImode);

          emit_move_insn (target, const0_rtx);
          target = gen_rtx_SUBREG (QImode, target, 0);

          int unspecv = (fcode == IX86_BUILTIN_ENQCMD
                         ? UNSPECV_ENQCMD
                         : UNSPECV_ENQCMDS);
          icode = code_for_enqcmd (unspecv, Pmode);
          emit_insn (GEN_FCN (icode) (op0, op1));

          /* Materialize ZF == 0 into the low byte of the result.  */
          emit_insn
            (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (EQ, QImode,
                                          gen_rtx_REG (CCZmode, FLAGS_REG),
                                          const0_rtx)));
          return SUBREG_REG (target);
        }
   15574              : 
    /* Expand the x87/SSE state save/restore builtins: all take a single
       pointer argument referring to a BLKmode memory area.  */
    case IX86_BUILTIN_FXSAVE:
    case IX86_BUILTIN_FXRSTOR:
    case IX86_BUILTIN_FXSAVE64:
    case IX86_BUILTIN_FXRSTOR64:
    case IX86_BUILTIN_FNSTENV:
    case IX86_BUILTIN_FLDENV:
      mode0 = BLKmode;
      switch (fcode)
        {
        case IX86_BUILTIN_FXSAVE:
          icode = CODE_FOR_fxsave;
          break;
        case IX86_BUILTIN_FXRSTOR:
          icode = CODE_FOR_fxrstor;
          break;
        case IX86_BUILTIN_FXSAVE64:
          icode = CODE_FOR_fxsave64;
          break;
        case IX86_BUILTIN_FXRSTOR64:
          icode = CODE_FOR_fxrstor64;
          break;
        case IX86_BUILTIN_FNSTENV:
          icode = CODE_FOR_fnstenv;
          break;
        case IX86_BUILTIN_FLDENV:
          icode = CODE_FOR_fldenv;
          break;
        default:
          gcc_unreachable ();
        }

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      op0 = gen_rtx_MEM (mode0, op0);

      pat = GEN_FCN (icode) (op0);
      if (pat)
        emit_insn (pat);
      return 0;
   15620              : 
    /* Expand __builtin_ia32_xsetbv: write a 64-bit value to an extended
       control register.  The value is split into EDX:EAX halves for the
       64-bit pattern.  */
    case IX86_BUILTIN_XSETBV:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
        op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
        {
          /* High 32 bits of the value go in a separate operand.  */
          op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
                                     NULL, 1, OPTAB_DIRECT);

          icode = CODE_FOR_xsetbv_rex64;

          op2 = gen_lowpart (SImode, op2);
          op1 = gen_lowpart (SImode, op1);
          pat = GEN_FCN (icode) (op0, op1, op2);
        }
      else
        {
          icode = CODE_FOR_xsetbv;

          pat = GEN_FCN (icode) (op0, op1);
        }
      if (pat)
        emit_insn (pat);
      return 0;
   15652              : 
    /* Expand the XSAVE/XRSTOR family: each takes a pointer to the state
       area and a 64-bit feature mask.  The mask is split into EDX:EAX
       halves for the 64-bit patterns.  */
    case IX86_BUILTIN_XSAVE:
    case IX86_BUILTIN_XRSTOR:
    case IX86_BUILTIN_XSAVE64:
    case IX86_BUILTIN_XRSTOR64:
    case IX86_BUILTIN_XSAVEOPT:
    case IX86_BUILTIN_XSAVEOPT64:
    case IX86_BUILTIN_XSAVES:
    case IX86_BUILTIN_XRSTORS:
    case IX86_BUILTIN_XSAVES64:
    case IX86_BUILTIN_XRSTORS64:
    case IX86_BUILTIN_XSAVEC:
    case IX86_BUILTIN_XSAVEC64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      /* The state area is an opaque BLKmode memory operand.  */
      op0 = gen_rtx_MEM (BLKmode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
        {
          /* Split the 64-bit mask into low (op1) and high (op2) SImode
             halves.  */
          op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
                                     NULL, 1, OPTAB_DIRECT);
          switch (fcode)
            {
            case IX86_BUILTIN_XSAVE:
              icode = CODE_FOR_xsave_rex64;
              break;
            case IX86_BUILTIN_XRSTOR:
              icode = CODE_FOR_xrstor_rex64;
              break;
            case IX86_BUILTIN_XSAVE64:
              icode = CODE_FOR_xsave64;
              break;
            case IX86_BUILTIN_XRSTOR64:
              icode = CODE_FOR_xrstor64;
              break;
            case IX86_BUILTIN_XSAVEOPT:
              icode = CODE_FOR_xsaveopt_rex64;
              break;
            case IX86_BUILTIN_XSAVEOPT64:
              icode = CODE_FOR_xsaveopt64;
              break;
            case IX86_BUILTIN_XSAVES:
              icode = CODE_FOR_xsaves_rex64;
              break;
            case IX86_BUILTIN_XRSTORS:
              icode = CODE_FOR_xrstors_rex64;
              break;
            case IX86_BUILTIN_XSAVES64:
              icode = CODE_FOR_xsaves64;
              break;
            case IX86_BUILTIN_XRSTORS64:
              icode = CODE_FOR_xrstors64;
              break;
            case IX86_BUILTIN_XSAVEC:
              icode = CODE_FOR_xsavec_rex64;
              break;
            case IX86_BUILTIN_XSAVEC64:
              icode = CODE_FOR_xsavec64;
              break;
            default:
              gcc_unreachable ();
            }

          op2 = gen_lowpart (SImode, op2);
          op1 = gen_lowpart (SImode, op1);
          pat = GEN_FCN (icode) (op0, op1, op2);
        }
      else
        {
          /* 32-bit patterns take the whole mask as one operand.  The
             64-suffixed builtins cannot reach here.  */
          switch (fcode)
            {
            case IX86_BUILTIN_XSAVE:
              icode = CODE_FOR_xsave;
              break;
            case IX86_BUILTIN_XRSTOR:
              icode = CODE_FOR_xrstor;
              break;
            case IX86_BUILTIN_XSAVEOPT:
              icode = CODE_FOR_xsaveopt;
              break;
            case IX86_BUILTIN_XSAVES:
              icode = CODE_FOR_xsaves;
              break;
            case IX86_BUILTIN_XRSTORS:
              icode = CODE_FOR_xrstors;
              break;
            case IX86_BUILTIN_XSAVEC:
              icode = CODE_FOR_xsavec;
              break;
            default:
              gcc_unreachable ();
            }
          pat = GEN_FCN (icode) (op0, op1);
        }

      if (pat)
        emit_insn (pat);
      return 0;
   15760              : 
    /* Expand the AMX tile-configuration load/store builtins; both take
       a single pointer to a BLKmode configuration area.  */
    case IX86_BUILTIN_LDTILECFG:
    case IX86_BUILTIN_STTILECFG:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
        {
          op0 = convert_memory_address (Pmode, op0);
          op0 = copy_addr_to_reg (op0);
        }
      op0 = gen_rtx_MEM (BLKmode, op0);
      if (fcode == IX86_BUILTIN_LDTILECFG)
        icode = CODE_FOR_ldtilecfg;
      else
        icode = CODE_FOR_sttilecfg;
      pat = GEN_FCN (icode) (op0);
      emit_insn (pat);
      return 0;
   15779              : 
    /* Expand __builtin_ia32_llwpcb: load the LWP control block pointer
       from the (zero-extended to Pmode) argument.  */
    case IX86_BUILTIN_LLWPCB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!register_operand (op0, Pmode))
        op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_lwp_llwpcb (Pmode, op0));
      return 0;

    /* Expand __builtin_ia32_slwpcb: store the current LWP control
       block pointer into TARGET.  */
    case IX86_BUILTIN_SLWPCB:
      if (!target
          || !register_operand (target, Pmode))
        target = gen_reg_rtx (Pmode);
      emit_insn (gen_lwp_slwpcb (Pmode, target));
      return target;
   15795              : 
    /* Expand the LWP value-insert builtins.  The 32/64 suffix selects
       the mode of the first operand; LWPINS additionally returns the
       carry-flag result as a QImode 0/1.  */
    case IX86_BUILTIN_LWPVAL32:
    case IX86_BUILTIN_LWPVAL64:
    case IX86_BUILTIN_LWPINS32:
    case IX86_BUILTIN_LWPINS64:
      mode = ((fcode == IX86_BUILTIN_LWPVAL32
               || fcode == IX86_BUILTIN_LWPINS32)
              ? SImode : DImode);

      if (fcode == IX86_BUILTIN_LWPVAL32
          || fcode == IX86_BUILTIN_LWPVAL64)
        icode = code_for_lwp_lwpval (mode);
      else
        icode = code_for_lwp_lwpins (mode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;

      /* Legitimize the first two operands against the insn predicates;
         the third must be a compile-time constant.  */
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
        op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, SImode))
        op1 = copy_to_mode_reg (SImode, op1);

      if (!CONST_INT_P (op2))
        {
          error ("the last argument must be a 32-bit immediate");
          return const0_rtx;
        }

      emit_insn (GEN_FCN (icode) (op0, op1, op2));

      if (fcode == IX86_BUILTIN_LWPINS32
          || fcode == IX86_BUILTIN_LWPINS64)
        {
          /* Materialize the carry flag (CCCmode == 0) as the QImode
             return value.  */
          if (target == 0
              || !nonimmediate_operand (target, QImode))
            target = gen_reg_rtx (QImode);

          pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
                            const0_rtx);
          emit_insn (gen_rtx_SET (target, pat));

          return target;
        }
      else
        return 0;
   15846              : 
   15847           18 :     case IX86_BUILTIN_BEXTRI32:
   15848           18 :     case IX86_BUILTIN_BEXTRI64:
   15849           18 :       mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
   15850              : 
   15851           18 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15852           18 :       arg1 = CALL_EXPR_ARG (exp, 1);
   15853           18 :       op0 = expand_normal (arg0);
   15854           18 :       op1 = expand_normal (arg1);
   15855              : 
   15856           18 :       if (!CONST_INT_P (op1))
   15857              :         {
   15858            0 :           error ("last argument must be an immediate");
   15859            0 :           return const0_rtx;
   15860              :         }
   15861              :       else
   15862              :         {
   15863           18 :           unsigned char lsb_index = UINTVAL (op1);
   15864           18 :           unsigned char length = UINTVAL (op1) >> 8;
   15865              : 
   15866           18 :           unsigned char bitsize = GET_MODE_BITSIZE (mode);
   15867              : 
   15868           18 :           icode = code_for_tbm_bextri (mode);
   15869              : 
   15870           18 :           mode1 = insn_data[icode].operand[1].mode;
   15871           18 :           if (!insn_data[icode].operand[1].predicate (op0, mode1))
   15872           12 :             op0 = copy_to_mode_reg (mode1, op0);
   15873              : 
   15874           18 :           mode0 = insn_data[icode].operand[0].mode;
   15875           18 :           if (target == 0
   15876           18 :               || !register_operand (target, mode0))
   15877            0 :             target = gen_reg_rtx (mode0);
   15878              : 
   15879           18 :           if (length == 0 || lsb_index >= bitsize)
   15880              :             {
   15881            8 :               emit_move_insn (target, const0_rtx);
   15882            8 :               return target;
   15883              :             }
   15884              : 
   15885           10 :           if (length + lsb_index > bitsize)
   15886            5 :             length = bitsize - lsb_index;
   15887              : 
   15888           10 :           op1 = GEN_INT (length);
   15889           10 :           op2 = GEN_INT (lsb_index);
   15890              : 
   15891           10 :           emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
   15892           10 :           return target;
   15893              :         }
   15894              : 
   15895           21 :     case IX86_BUILTIN_RDRAND16_STEP:
   15896           21 :       mode = HImode;
   15897           21 :       goto rdrand_step;
   15898              : 
   15899           42 :     case IX86_BUILTIN_RDRAND32_STEP:
   15900           42 :       mode = SImode;
   15901           42 :       goto rdrand_step;
   15902              : 
   15903              :     case IX86_BUILTIN_RDRAND64_STEP:
   15904              :       mode = DImode;
   15905              : 
   15906           83 : rdrand_step:
   15907           83 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15908           83 :       op1 = expand_normal (arg0);
   15909           83 :       if (!address_operand (op1, VOIDmode))
   15910              :         {
   15911           29 :           op1 = convert_memory_address (Pmode, op1);
   15912           29 :           op1 = copy_addr_to_reg (op1);
   15913              :         }
   15914              : 
   15915           83 :       op0 = gen_reg_rtx (mode);
   15916           83 :       emit_insn (gen_rdrand (mode, op0));
   15917              : 
   15918           83 :       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   15919              : 
   15920           83 :       op1 = force_reg (SImode, const1_rtx);
   15921              : 
   15922              :       /* Emit SImode conditional move.  */
   15923           83 :       if (mode == HImode)
   15924              :         {
   15925           21 :           if (TARGET_ZERO_EXTEND_WITH_AND
   15926           21 :               && optimize_function_for_speed_p (cfun))
   15927              :             {
   15928            0 :               op2 = force_reg (SImode, const0_rtx);
   15929              : 
   15930            0 :               emit_insn (gen_movstricthi
   15931            0 :                          (gen_lowpart (HImode, op2), op0));
   15932              :             }
   15933              :           else
   15934              :             {
   15935           21 :               op2 = gen_reg_rtx (SImode);
   15936              : 
   15937           21 :               emit_insn (gen_zero_extendhisi2 (op2, op0));
   15938              :             }
   15939              :         }
   15940           62 :       else if (mode == SImode)
   15941              :         op2 = op0;
   15942              :       else
   15943           20 :         op2 = gen_rtx_SUBREG (SImode, op0, 0);
   15944              : 
   15945           83 :       if (target == 0
   15946           83 :           || !register_operand (target, SImode))
   15947            7 :         target = gen_reg_rtx (SImode);
   15948              : 
   15949           83 :       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
   15950              :                          const0_rtx);
   15951           83 :       emit_insn (gen_rtx_SET (target,
   15952              :                               gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
   15953           83 :       return target;
   15954              : 
   15955           19 :     case IX86_BUILTIN_RDSEED16_STEP:
   15956           19 :       mode = HImode;
   15957           19 :       goto rdseed_step;
   15958              : 
   15959           28 :     case IX86_BUILTIN_RDSEED32_STEP:
   15960           28 :       mode = SImode;
   15961           28 :       goto rdseed_step;
   15962              : 
   15963              :     case IX86_BUILTIN_RDSEED64_STEP:
   15964              :       mode = DImode;
   15965              : 
   15966           66 : rdseed_step:
   15967           66 :       arg0 = CALL_EXPR_ARG (exp, 0);
   15968           66 :       op1 = expand_normal (arg0);
   15969           66 :       if (!address_operand (op1, VOIDmode))
   15970              :         {
   15971           28 :           op1 = convert_memory_address (Pmode, op1);
   15972           28 :           op1 = copy_addr_to_reg (op1);
   15973              :         }
   15974              : 
   15975           66 :       op0 = gen_reg_rtx (mode);
   15976           66 :       emit_insn (gen_rdseed (mode, op0));
   15977              : 
   15978           66 :       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   15979              : 
   15980           66 :       op2 = gen_reg_rtx (QImode);
   15981              : 
   15982           66 :       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   15983              :                          const0_rtx);
   15984           66 :       emit_insn (gen_rtx_SET (op2, pat));
   15985              : 
   15986           66 :       if (target == 0
   15987           66 :           || !register_operand (target, SImode))
   15988            1 :         target = gen_reg_rtx (SImode);
   15989              : 
   15990           66 :       emit_insn (gen_zero_extendqisi2 (target, op2));
   15991           66 :       return target;
   15992              : 
   15993           38 :     case IX86_BUILTIN_SBB32:
   15994           38 :       icode = CODE_FOR_subborrowsi;
   15995           38 :       icode2 = CODE_FOR_subborrowsi_0;
   15996           38 :       mode0 = SImode;
   15997           38 :       mode1 = DImode;
   15998           38 :       mode2 = CCmode;
   15999           38 :       goto handlecarry;
   16000              : 
   16001           44 :     case IX86_BUILTIN_SBB64:
   16002           44 :       icode = CODE_FOR_subborrowdi;
   16003           44 :       icode2 = CODE_FOR_subborrowdi_0;
   16004           44 :       mode0 = DImode;
   16005           44 :       mode1 = TImode;
   16006           44 :       mode2 = CCmode;
   16007           44 :       goto handlecarry;
   16008              : 
   16009           68 :     case IX86_BUILTIN_ADDCARRYX32:
   16010           68 :       icode = CODE_FOR_addcarrysi;
   16011           68 :       icode2 = CODE_FOR_addcarrysi_0;
   16012           68 :       mode0 = SImode;
   16013           68 :       mode1 = DImode;
   16014           68 :       mode2 = CCCmode;
   16015           68 :       goto handlecarry;
   16016              : 
   16017              :     case IX86_BUILTIN_ADDCARRYX64:
   16018              :       icode = CODE_FOR_addcarrydi;
   16019              :       icode2 = CODE_FOR_addcarrydi_0;
   16020              :       mode0 = DImode;
   16021              :       mode1 = TImode;
   16022              :       mode2 = CCCmode;
   16023              : 
   16024          212 :     handlecarry:
   16025          212 :       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
   16026          212 :       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
   16027          212 :       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
   16028          212 :       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
   16029              : 
   16030          212 :       op1 = expand_normal (arg0);
   16031              : 
   16032          212 :       op2 = expand_normal (arg1);
   16033          212 :       if (!register_operand (op2, mode0))
   16034          117 :         op2 = copy_to_mode_reg (mode0, op2);
   16035              : 
   16036          212 :       op3 = expand_normal (arg2);
   16037          212 :       if (!register_operand (op3, mode0))
   16038          120 :         op3 = copy_to_mode_reg (mode0, op3);
   16039              : 
   16040          212 :       op4 = expand_normal (arg3);
   16041          212 :       if (!address_operand (op4, VOIDmode))
   16042              :         {
   16043           67 :           op4 = convert_memory_address (Pmode, op4);
   16044           67 :           op4 = copy_addr_to_reg (op4);
   16045              :         }
   16046              : 
   16047          212 :       op0 = gen_reg_rtx (mode0);
   16048          212 :       if (op1 == const0_rtx)
   16049              :         {
   16050              :           /* If arg0 is 0, optimize right away into add or sub
   16051              :              instruction that sets CCCmode flags.  */
   16052           21 :           op1 = gen_rtx_REG (mode2, FLAGS_REG);
   16053           21 :           emit_insn (GEN_FCN (icode2) (op0, op2, op3));
   16054              :         }
   16055              :       else
   16056              :         {
   16057              :           /* Generate CF from input operand.  */
   16058          191 :           ix86_expand_carry (op1);
   16059              : 
   16060              :           /* Generate instruction that consumes CF.  */
   16061          191 :           op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
   16062          191 :           pat = gen_rtx_LTU (mode1, op1, const0_rtx);
   16063          191 :           pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
   16064          191 :           emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
   16065              :         }
   16066              : 
   16067              :       /* Return current CF value.  */
   16068          212 :       if (target == 0)
   16069           14 :         target = gen_reg_rtx (QImode);
   16070              : 
   16071          212 :       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
   16072          212 :       emit_insn (gen_rtx_SET (target, pat));
   16073              : 
   16074              :       /* Store the result.  */
   16075          212 :       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
   16076              : 
   16077          212 :       return target;
   16078              : 
   16079           24 :     case IX86_BUILTIN_READ_FLAGS:
   16080           24 :       if (ignore)
   16081            1 :         return const0_rtx;
   16082              : 
   16083           23 :       emit_insn (gen_pushfl ());
   16084              : 
   16085           23 :       if (optimize
   16086           11 :           || target == NULL_RTX
   16087           11 :           || !nonimmediate_operand (target, word_mode)
   16088           34 :           || GET_MODE (target) != word_mode)
   16089           12 :         target = gen_reg_rtx (word_mode);
   16090              : 
   16091           23 :       emit_insn (gen_pop (target));
   16092           23 :       return target;
   16093              : 
   16094           21 :     case IX86_BUILTIN_WRITE_FLAGS:
   16095              : 
   16096           21 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16097           21 :       op0 = expand_normal (arg0);
   16098           21 :       if (!general_no_elim_operand (op0, word_mode))
   16099            0 :         op0 = copy_to_mode_reg (word_mode, op0);
   16100              : 
   16101           21 :       emit_insn (gen_push (op0));
   16102           21 :       emit_insn (gen_popfl ());
   16103           21 :       return 0;
   16104              : 
   16105           22 :     case IX86_BUILTIN_KTESTC8:
   16106           22 :       icode = CODE_FOR_ktestqi;
   16107           22 :       mode3 = CCCmode;
   16108           22 :       goto kortest;
   16109              : 
   16110           22 :     case IX86_BUILTIN_KTESTZ8:
   16111           22 :       icode = CODE_FOR_ktestqi;
   16112           22 :       mode3 = CCZmode;
   16113           22 :       goto kortest;
   16114              : 
   16115           22 :     case IX86_BUILTIN_KTESTC16:
   16116           22 :       icode = CODE_FOR_ktesthi;
   16117           22 :       mode3 = CCCmode;
   16118           22 :       goto kortest;
   16119              : 
   16120           22 :     case IX86_BUILTIN_KTESTZ16:
   16121           22 :       icode = CODE_FOR_ktesthi;
   16122           22 :       mode3 = CCZmode;
   16123           22 :       goto kortest;
   16124              : 
   16125           22 :     case IX86_BUILTIN_KTESTC32:
   16126           22 :       icode = CODE_FOR_ktestsi;
   16127           22 :       mode3 = CCCmode;
   16128           22 :       goto kortest;
   16129              : 
   16130           22 :     case IX86_BUILTIN_KTESTZ32:
   16131           22 :       icode = CODE_FOR_ktestsi;
   16132           22 :       mode3 = CCZmode;
   16133           22 :       goto kortest;
   16134              : 
   16135           22 :     case IX86_BUILTIN_KTESTC64:
   16136           22 :       icode = CODE_FOR_ktestdi;
   16137           22 :       mode3 = CCCmode;
   16138           22 :       goto kortest;
   16139              : 
   16140           22 :     case IX86_BUILTIN_KTESTZ64:
   16141           22 :       icode = CODE_FOR_ktestdi;
   16142           22 :       mode3 = CCZmode;
   16143           22 :       goto kortest;
   16144              : 
   16145           22 :     case IX86_BUILTIN_KORTESTC8:
   16146           22 :       icode = CODE_FOR_kortestqi;
   16147           22 :       mode3 = CCCmode;
   16148           22 :       goto kortest;
   16149              : 
   16150           76 :     case IX86_BUILTIN_KORTESTZ8:
   16151           76 :       icode = CODE_FOR_kortestqi;
   16152           76 :       mode3 = CCZmode;
   16153           76 :       goto kortest;
   16154              : 
   16155           38 :     case IX86_BUILTIN_KORTESTC16:
   16156           38 :       icode = CODE_FOR_kortesthi;
   16157           38 :       mode3 = CCCmode;
   16158           38 :       goto kortest;
   16159              : 
   16160           91 :     case IX86_BUILTIN_KORTESTZ16:
   16161           91 :       icode = CODE_FOR_kortesthi;
   16162           91 :       mode3 = CCZmode;
   16163           91 :       goto kortest;
   16164              : 
   16165           22 :     case IX86_BUILTIN_KORTESTC32:
   16166           22 :       icode = CODE_FOR_kortestsi;
   16167           22 :       mode3 = CCCmode;
   16168           22 :       goto kortest;
   16169              : 
   16170           79 :     case IX86_BUILTIN_KORTESTZ32:
   16171           79 :       icode = CODE_FOR_kortestsi;
   16172           79 :       mode3 = CCZmode;
   16173           79 :       goto kortest;
   16174              : 
   16175           22 :     case IX86_BUILTIN_KORTESTC64:
   16176           22 :       icode = CODE_FOR_kortestdi;
   16177           22 :       mode3 = CCCmode;
   16178           22 :       goto kortest;
   16179              : 
   16180              :     case IX86_BUILTIN_KORTESTZ64:
   16181              :       icode = CODE_FOR_kortestdi;
   16182              :       mode3 = CCZmode;
   16183              : 
   16184          610 :     kortest:
   16185          610 :       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
   16186          610 :       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
   16187          610 :       op0 = expand_normal (arg0);
   16188          610 :       op1 = expand_normal (arg1);
   16189              : 
   16190          610 :       mode0 = insn_data[icode].operand[0].mode;
   16191          610 :       mode1 = insn_data[icode].operand[1].mode;
   16192              : 
   16193          610 :       if (GET_MODE (op0) != VOIDmode)
   16194          610 :         op0 = force_reg (GET_MODE (op0), op0);
   16195              : 
   16196          610 :       op0 = gen_lowpart (mode0, op0);
   16197              : 
   16198          610 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   16199            0 :         op0 = copy_to_mode_reg (mode0, op0);
   16200              : 
   16201          610 :       if (GET_MODE (op1) != VOIDmode)
   16202          609 :         op1 = force_reg (GET_MODE (op1), op1);
   16203              : 
   16204          610 :       op1 = gen_lowpart (mode1, op1);
   16205              : 
   16206          610 :       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   16207            1 :         op1 = copy_to_mode_reg (mode1, op1);
   16208              : 
   16209          610 :       target = gen_reg_rtx (QImode);
   16210              : 
   16211              :       /* Emit kortest.  */
   16212          610 :       emit_insn (GEN_FCN (icode) (op0, op1));
   16213              :       /* And use setcc to return result from flags.  */
   16214          610 :       ix86_expand_setcc (target, EQ,
   16215              :                          gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
   16216          610 :       return target;
   16217              : 
   16218           24 :     case IX86_BUILTIN_GATHERSIV2DF:
   16219           24 :       icode = CODE_FOR_avx2_gathersiv2df;
   16220           24 :       goto gather_gen;
   16221           18 :     case IX86_BUILTIN_GATHERSIV4DF:
   16222           18 :       icode = CODE_FOR_avx2_gathersiv4df;
   16223           18 :       goto gather_gen;
   16224           21 :     case IX86_BUILTIN_GATHERDIV2DF:
   16225           21 :       icode = CODE_FOR_avx2_gatherdiv2df;
   16226           21 :       goto gather_gen;
   16227           32 :     case IX86_BUILTIN_GATHERDIV4DF:
   16228           32 :       icode = CODE_FOR_avx2_gatherdiv4df;
   16229           32 :       goto gather_gen;
   16230           30 :     case IX86_BUILTIN_GATHERSIV4SF:
   16231           30 :       icode = CODE_FOR_avx2_gathersiv4sf;
   16232           30 :       goto gather_gen;
   16233           37 :     case IX86_BUILTIN_GATHERSIV8SF:
   16234           37 :       icode = CODE_FOR_avx2_gathersiv8sf;
   16235           37 :       goto gather_gen;
   16236           24 :     case IX86_BUILTIN_GATHERDIV4SF:
   16237           24 :       icode = CODE_FOR_avx2_gatherdiv4sf;
   16238           24 :       goto gather_gen;
   16239           18 :     case IX86_BUILTIN_GATHERDIV8SF:
   16240           18 :       icode = CODE_FOR_avx2_gatherdiv8sf;
   16241           18 :       goto gather_gen;
   16242           18 :     case IX86_BUILTIN_GATHERSIV2DI:
   16243           18 :       icode = CODE_FOR_avx2_gathersiv2di;
   16244           18 :       goto gather_gen;
   16245           18 :     case IX86_BUILTIN_GATHERSIV4DI:
   16246           18 :       icode = CODE_FOR_avx2_gathersiv4di;
   16247           18 :       goto gather_gen;
   16248           27 :     case IX86_BUILTIN_GATHERDIV2DI:
   16249           27 :       icode = CODE_FOR_avx2_gatherdiv2di;
   16250           27 :       goto gather_gen;
   16251           29 :     case IX86_BUILTIN_GATHERDIV4DI:
   16252           29 :       icode = CODE_FOR_avx2_gatherdiv4di;
   16253           29 :       goto gather_gen;
   16254           20 :     case IX86_BUILTIN_GATHERSIV4SI:
   16255           20 :       icode = CODE_FOR_avx2_gathersiv4si;
   16256           20 :       goto gather_gen;
   16257           22 :     case IX86_BUILTIN_GATHERSIV8SI:
   16258           22 :       icode = CODE_FOR_avx2_gathersiv8si;
   16259           22 :       goto gather_gen;
   16260           28 :     case IX86_BUILTIN_GATHERDIV4SI:
   16261           28 :       icode = CODE_FOR_avx2_gatherdiv4si;
   16262           28 :       goto gather_gen;
   16263           18 :     case IX86_BUILTIN_GATHERDIV8SI:
   16264           18 :       icode = CODE_FOR_avx2_gatherdiv8si;
   16265           18 :       goto gather_gen;
   16266           20 :     case IX86_BUILTIN_GATHERALTSIV4DF:
   16267           20 :       icode = CODE_FOR_avx2_gathersiv4df;
   16268           20 :       goto gather_gen;
   16269           16 :     case IX86_BUILTIN_GATHERALTDIV8SF:
   16270           16 :       icode = CODE_FOR_avx2_gatherdiv8sf;
   16271           16 :       goto gather_gen;
   16272            4 :     case IX86_BUILTIN_GATHERALTSIV4DI:
   16273            4 :       icode = CODE_FOR_avx2_gathersiv4di;
   16274            4 :       goto gather_gen;
   16275           12 :     case IX86_BUILTIN_GATHERALTDIV8SI:
   16276           12 :       icode = CODE_FOR_avx2_gatherdiv8si;
   16277           12 :       goto gather_gen;
   16278           36 :     case IX86_BUILTIN_GATHER3SIV16SF:
   16279           36 :       icode = CODE_FOR_avx512f_gathersiv16sf;
   16280           36 :       goto gather_gen;
   16281           24 :     case IX86_BUILTIN_GATHER3SIV8DF:
   16282           24 :       icode = CODE_FOR_avx512f_gathersiv8df;
   16283           24 :       goto gather_gen;
   16284           24 :     case IX86_BUILTIN_GATHER3DIV16SF:
   16285           24 :       icode = CODE_FOR_avx512f_gatherdiv16sf;
   16286           24 :       goto gather_gen;
   16287           37 :     case IX86_BUILTIN_GATHER3DIV8DF:
   16288           37 :       icode = CODE_FOR_avx512f_gatherdiv8df;
   16289           37 :       goto gather_gen;
   16290           30 :     case IX86_BUILTIN_GATHER3SIV16SI:
   16291           30 :       icode = CODE_FOR_avx512f_gathersiv16si;
   16292           30 :       goto gather_gen;
   16293           24 :     case IX86_BUILTIN_GATHER3SIV8DI:
   16294           24 :       icode = CODE_FOR_avx512f_gathersiv8di;
   16295           24 :       goto gather_gen;
   16296           24 :     case IX86_BUILTIN_GATHER3DIV16SI:
   16297           24 :       icode = CODE_FOR_avx512f_gatherdiv16si;
   16298           24 :       goto gather_gen;
   16299           37 :     case IX86_BUILTIN_GATHER3DIV8DI:
   16300           37 :       icode = CODE_FOR_avx512f_gatherdiv8di;
   16301           37 :       goto gather_gen;
   16302           16 :     case IX86_BUILTIN_GATHER3ALTSIV8DF:
   16303           16 :       icode = CODE_FOR_avx512f_gathersiv8df;
   16304           16 :       goto gather_gen;
   16305           22 :     case IX86_BUILTIN_GATHER3ALTDIV16SF:
   16306           22 :       icode = CODE_FOR_avx512f_gatherdiv16sf;
   16307           22 :       goto gather_gen;
   16308           14 :     case IX86_BUILTIN_GATHER3ALTSIV8DI:
   16309           14 :       icode = CODE_FOR_avx512f_gathersiv8di;
   16310           14 :       goto gather_gen;
   16311           18 :     case IX86_BUILTIN_GATHER3ALTDIV16SI:
   16312           18 :       icode = CODE_FOR_avx512f_gatherdiv16si;
   16313           18 :       goto gather_gen;
   16314           18 :     case IX86_BUILTIN_GATHER3SIV2DF:
   16315           18 :       icode = CODE_FOR_avx512vl_gathersiv2df;
   16316           18 :       goto gather_gen;
   16317           10 :     case IX86_BUILTIN_GATHER3SIV4DF:
   16318           10 :       icode = CODE_FOR_avx512vl_gathersiv4df;
   16319           10 :       goto gather_gen;
   16320           15 :     case IX86_BUILTIN_GATHER3DIV2DF:
   16321           15 :       icode = CODE_FOR_avx512vl_gatherdiv2df;
   16322           15 :       goto gather_gen;
   16323           16 :     case IX86_BUILTIN_GATHER3DIV4DF:
   16324           16 :       icode = CODE_FOR_avx512vl_gatherdiv4df;
   16325           16 :       goto gather_gen;
   16326           14 :     case IX86_BUILTIN_GATHER3SIV4SF:
   16327           14 :       icode = CODE_FOR_avx512vl_gathersiv4sf;
   16328           14 :       goto gather_gen;
   16329           12 :     case IX86_BUILTIN_GATHER3SIV8SF:
   16330           12 :       icode = CODE_FOR_avx512vl_gathersiv8sf;
   16331           12 :       goto gather_gen;
   16332           22 :     case IX86_BUILTIN_GATHER3DIV4SF:
   16333           22 :       icode = CODE_FOR_avx512vl_gatherdiv4sf;
   16334           22 :       goto gather_gen;
   16335           10 :     case IX86_BUILTIN_GATHER3DIV8SF:
   16336           10 :       icode = CODE_FOR_avx512vl_gatherdiv8sf;
   16337           10 :       goto gather_gen;
   16338           20 :     case IX86_BUILTIN_GATHER3SIV2DI:
   16339           20 :       icode = CODE_FOR_avx512vl_gathersiv2di;
   16340           20 :       goto gather_gen;
   16341           10 :     case IX86_BUILTIN_GATHER3SIV4DI:
   16342           10 :       icode = CODE_FOR_avx512vl_gathersiv4di;
   16343           10 :       goto gather_gen;
   16344           14 :     case IX86_BUILTIN_GATHER3DIV2DI:
   16345           14 :       icode = CODE_FOR_avx512vl_gatherdiv2di;
   16346           14 :       goto gather_gen;
   16347           13 :     case IX86_BUILTIN_GATHER3DIV4DI:
   16348           13 :       icode = CODE_FOR_avx512vl_gatherdiv4di;
   16349           13 :       goto gather_gen;
   16350           14 :     case IX86_BUILTIN_GATHER3SIV4SI:
   16351           14 :       icode = CODE_FOR_avx512vl_gathersiv4si;
   16352           14 :       goto gather_gen;
   16353           12 :     case IX86_BUILTIN_GATHER3SIV8SI:
   16354           12 :       icode = CODE_FOR_avx512vl_gathersiv8si;
   16355           12 :       goto gather_gen;
   16356           24 :     case IX86_BUILTIN_GATHER3DIV4SI:
   16357           24 :       icode = CODE_FOR_avx512vl_gatherdiv4si;
   16358           24 :       goto gather_gen;
   16359           10 :     case IX86_BUILTIN_GATHER3DIV8SI:
   16360           10 :       icode = CODE_FOR_avx512vl_gatherdiv8si;
   16361           10 :       goto gather_gen;
   16362            4 :     case IX86_BUILTIN_GATHER3ALTSIV4DF:
   16363            4 :       icode = CODE_FOR_avx512vl_gathersiv4df;
   16364            4 :       goto gather_gen;
   16365            8 :     case IX86_BUILTIN_GATHER3ALTDIV8SF:
   16366            8 :       icode = CODE_FOR_avx512vl_gatherdiv8sf;
   16367            8 :       goto gather_gen;
   16368            6 :     case IX86_BUILTIN_GATHER3ALTSIV4DI:
   16369            6 :       icode = CODE_FOR_avx512vl_gathersiv4di;
   16370            6 :       goto gather_gen;
   16371           10 :     case IX86_BUILTIN_GATHER3ALTDIV8SI:
   16372           10 :       icode = CODE_FOR_avx512vl_gatherdiv8si;
   16373           10 :       goto gather_gen;
   16374           40 :     case IX86_BUILTIN_SCATTERSIV16SF:
   16375           40 :       icode = CODE_FOR_avx512f_scattersiv16sf;
   16376           40 :       goto scatter_gen;
   16377           27 :     case IX86_BUILTIN_SCATTERSIV8DF:
   16378           27 :       icode = CODE_FOR_avx512f_scattersiv8df;
   16379           27 :       goto scatter_gen;
   16380           24 :     case IX86_BUILTIN_SCATTERDIV16SF:
   16381           24 :       icode = CODE_FOR_avx512f_scatterdiv16sf;
   16382           24 :       goto scatter_gen;
   16383           33 :     case IX86_BUILTIN_SCATTERDIV8DF:
   16384           33 :       icode = CODE_FOR_avx512f_scatterdiv8df;
   16385           33 :       goto scatter_gen;
   16386           30 :     case IX86_BUILTIN_SCATTERSIV16SI:
   16387           30 :       icode = CODE_FOR_avx512f_scattersiv16si;
   16388           30 :       goto scatter_gen;
   16389           24 :     case IX86_BUILTIN_SCATTERSIV8DI:
   16390           24 :       icode = CODE_FOR_avx512f_scattersiv8di;
   16391           24 :       goto scatter_gen;
   16392           24 :     case IX86_BUILTIN_SCATTERDIV16SI:
   16393           24 :       icode = CODE_FOR_avx512f_scatterdiv16si;
   16394           24 :       goto scatter_gen;
   16395           29 :     case IX86_BUILTIN_SCATTERDIV8DI:
   16396           29 :       icode = CODE_FOR_avx512f_scatterdiv8di;
   16397           29 :       goto scatter_gen;
   16398           18 :     case IX86_BUILTIN_SCATTERSIV8SF:
   16399           18 :       icode = CODE_FOR_avx512vl_scattersiv8sf;
   16400           18 :       goto scatter_gen;
   16401           20 :     case IX86_BUILTIN_SCATTERSIV4SF:
   16402           20 :       icode = CODE_FOR_avx512vl_scattersiv4sf;
   16403           20 :       goto scatter_gen;
   16404           16 :     case IX86_BUILTIN_SCATTERSIV4DF:
   16405           16 :       icode = CODE_FOR_avx512vl_scattersiv4df;
   16406           16 :       goto scatter_gen;
   16407           16 :     case IX86_BUILTIN_SCATTERSIV2DF:
   16408           16 :       icode = CODE_FOR_avx512vl_scattersiv2df;
   16409           16 :       goto scatter_gen;
   16410           16 :     case IX86_BUILTIN_SCATTERDIV8SF:
   16411           16 :       icode = CODE_FOR_avx512vl_scatterdiv8sf;
   16412           16 :       goto scatter_gen;
   16413           16 :     case IX86_BUILTIN_SCATTERDIV4SF:
   16414           16 :       icode = CODE_FOR_avx512vl_scatterdiv4sf;
   16415           16 :       goto scatter_gen;
   16416           18 :     case IX86_BUILTIN_SCATTERDIV4DF:
   16417           18 :       icode = CODE_FOR_avx512vl_scatterdiv4df;
   16418           18 :       goto scatter_gen;
   16419           18 :     case IX86_BUILTIN_SCATTERDIV2DF:
   16420           18 :       icode = CODE_FOR_avx512vl_scatterdiv2df;
   16421           18 :       goto scatter_gen;
   16422           22 :     case IX86_BUILTIN_SCATTERSIV8SI:
   16423           22 :       icode = CODE_FOR_avx512vl_scattersiv8si;
   16424           22 :       goto scatter_gen;
   16425           24 :     case IX86_BUILTIN_SCATTERSIV4SI:
   16426           24 :       icode = CODE_FOR_avx512vl_scattersiv4si;
   16427           24 :       goto scatter_gen;
   16428           16 :     case IX86_BUILTIN_SCATTERSIV4DI:
   16429           16 :       icode = CODE_FOR_avx512vl_scattersiv4di;
   16430           16 :       goto scatter_gen;
   16431           16 :     case IX86_BUILTIN_SCATTERSIV2DI:
   16432           16 :       icode = CODE_FOR_avx512vl_scattersiv2di;
   16433           16 :       goto scatter_gen;
   16434           16 :     case IX86_BUILTIN_SCATTERDIV8SI:
   16435           16 :       icode = CODE_FOR_avx512vl_scatterdiv8si;
   16436           16 :       goto scatter_gen;
   16437           16 :     case IX86_BUILTIN_SCATTERDIV4SI:
   16438           16 :       icode = CODE_FOR_avx512vl_scatterdiv4si;
   16439           16 :       goto scatter_gen;
   16440           18 :     case IX86_BUILTIN_SCATTERDIV4DI:
   16441           18 :       icode = CODE_FOR_avx512vl_scatterdiv4di;
   16442           18 :       goto scatter_gen;
   16443           18 :     case IX86_BUILTIN_SCATTERDIV2DI:
   16444           18 :       icode = CODE_FOR_avx512vl_scatterdiv2di;
   16445           18 :       goto scatter_gen;
   16446           16 :     case IX86_BUILTIN_SCATTERALTSIV8DF:
   16447           16 :       icode = CODE_FOR_avx512f_scattersiv8df;
   16448           16 :       goto scatter_gen;
   16449           12 :     case IX86_BUILTIN_SCATTERALTDIV16SF:
   16450           12 :       icode = CODE_FOR_avx512f_scatterdiv16sf;
   16451           12 :       goto scatter_gen;
   16452            8 :     case IX86_BUILTIN_SCATTERALTSIV8DI:
   16453            8 :       icode = CODE_FOR_avx512f_scattersiv8di;
   16454            8 :       goto scatter_gen;
   16455           24 :     case IX86_BUILTIN_SCATTERALTDIV16SI:
   16456           24 :       icode = CODE_FOR_avx512f_scatterdiv16si;
   16457           24 :       goto scatter_gen;
   16458            4 :     case IX86_BUILTIN_SCATTERALTSIV4DF:
   16459            4 :       icode = CODE_FOR_avx512vl_scattersiv4df;
   16460            4 :       goto scatter_gen;
   16461            4 :     case IX86_BUILTIN_SCATTERALTDIV8SF:
   16462            4 :       icode = CODE_FOR_avx512vl_scatterdiv8sf;
   16463            4 :       goto scatter_gen;
   16464            4 :     case IX86_BUILTIN_SCATTERALTSIV4DI:
   16465            4 :       icode = CODE_FOR_avx512vl_scattersiv4di;
   16466            4 :       goto scatter_gen;
   16467            4 :     case IX86_BUILTIN_SCATTERALTDIV8SI:
   16468            4 :       icode = CODE_FOR_avx512vl_scatterdiv8si;
   16469            4 :       goto scatter_gen;
   16470            8 :     case IX86_BUILTIN_SCATTERALTSIV2DF:
   16471            8 :       icode = CODE_FOR_avx512vl_scattersiv2df;
   16472            8 :       goto scatter_gen;
   16473            8 :     case IX86_BUILTIN_SCATTERALTDIV4SF:
   16474            8 :       icode = CODE_FOR_avx512vl_scatterdiv4sf;
   16475            8 :       goto scatter_gen;
   16476            8 :     case IX86_BUILTIN_SCATTERALTSIV2DI:
   16477            8 :       icode = CODE_FOR_avx512vl_scattersiv2di;
   16478            8 :       goto scatter_gen;
   16479            8 :     case IX86_BUILTIN_SCATTERALTDIV4SI:
   16480            8 :       icode = CODE_FOR_avx512vl_scatterdiv4si;
   16481            8 :       goto scatter_gen;
   16482              : 
   16483         1004 :     gather_gen:
   16484         1004 :       rtx half;
   16485         1004 :       rtx (*gen) (rtx, rtx);
   16486              : 
   16487         1004 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16488         1004 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16489         1004 :       arg2 = CALL_EXPR_ARG (exp, 2);
   16490         1004 :       arg3 = CALL_EXPR_ARG (exp, 3);
   16491         1004 :       arg4 = CALL_EXPR_ARG (exp, 4);
   16492         1004 :       op0 = expand_normal (arg0);
   16493         1004 :       op1 = expand_normal (arg1);
   16494         1004 :       op2 = expand_normal (arg2);
   16495         1004 :       op3 = ix86_expand_unsigned_small_int_cst_argument (arg3);
   16496         1004 :       op4 = expand_normal (arg4);
   16497              :       /* Note the arg order is different from the operand order.  */
   16498         1004 :       mode0 = insn_data[icode].operand[1].mode;
   16499         1004 :       mode2 = insn_data[icode].operand[3].mode;
   16500         1004 :       mode3 = insn_data[icode].operand[4].mode;
   16501         1004 :       mode4 = insn_data[icode].operand[5].mode;
   16502              : 
   16503         1004 :       if (target == NULL_RTX
   16504         1004 :           || GET_MODE (target) != insn_data[icode].operand[0].mode
   16505         1904 :           || !insn_data[icode].operand[0].predicate (target,
   16506              :                                                      GET_MODE (target)))
   16507          105 :         subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
   16508              :       else
   16509              :         subtarget = target;
   16510              : 
   16511         1004 :       switch (fcode)
   16512              :         {
   16513           30 :         case IX86_BUILTIN_GATHER3ALTSIV8DF:
   16514           30 :         case IX86_BUILTIN_GATHER3ALTSIV8DI:
   16515           30 :           half = gen_reg_rtx (V8SImode);
   16516           30 :           if (!nonimmediate_operand (op2, V16SImode))
   16517            0 :             op2 = copy_to_mode_reg (V16SImode, op2);
   16518           30 :           emit_insn (gen_vec_extract_lo_v16si (half, op2));
   16519           30 :           op2 = half;
   16520           30 :           break;
   16521           34 :         case IX86_BUILTIN_GATHER3ALTSIV4DF:
   16522           34 :         case IX86_BUILTIN_GATHER3ALTSIV4DI:
   16523           34 :         case IX86_BUILTIN_GATHERALTSIV4DF:
   16524           34 :         case IX86_BUILTIN_GATHERALTSIV4DI:
   16525           34 :           half = gen_reg_rtx (V4SImode);
   16526           34 :           if (!nonimmediate_operand (op2, V8SImode))
   16527            0 :             op2 = copy_to_mode_reg (V8SImode, op2);
   16528           34 :           emit_insn (gen_vec_extract_lo_v8si (half, op2));
   16529           34 :           op2 = half;
   16530           34 :           break;
   16531           40 :         case IX86_BUILTIN_GATHER3ALTDIV16SF:
   16532           40 :         case IX86_BUILTIN_GATHER3ALTDIV16SI:
   16533           40 :           half = gen_reg_rtx (mode0);
   16534           40 :           if (mode0 == V8SFmode)
   16535              :             gen = gen_vec_extract_lo_v16sf;
   16536              :           else
   16537           18 :             gen = gen_vec_extract_lo_v16si;
   16538           40 :           if (!nonimmediate_operand (op0, GET_MODE (op0)))
   16539           40 :             op0 = copy_to_mode_reg (GET_MODE (op0), op0);
   16540           40 :           emit_insn (gen (half, op0));
   16541           40 :           op0 = half;
   16542           40 :           op3 = lowpart_subreg (QImode, op3, HImode);
   16543           40 :           break;
   16544           46 :         case IX86_BUILTIN_GATHER3ALTDIV8SF:
   16545           46 :         case IX86_BUILTIN_GATHER3ALTDIV8SI:
   16546           46 :         case IX86_BUILTIN_GATHERALTDIV8SF:
   16547           46 :         case IX86_BUILTIN_GATHERALTDIV8SI:
   16548           46 :           half = gen_reg_rtx (mode0);
   16549           46 :           if (mode0 == V4SFmode)
   16550              :             gen = gen_vec_extract_lo_v8sf;
   16551              :           else
   16552           22 :             gen = gen_vec_extract_lo_v8si;
   16553           46 :           if (!nonimmediate_operand (op0, GET_MODE (op0)))
   16554           46 :             op0 = copy_to_mode_reg (GET_MODE (op0), op0);
   16555           46 :           emit_insn (gen (half, op0));
   16556           46 :           op0 = half;
   16557           46 :           if (VECTOR_MODE_P (GET_MODE (op3)))
   16558              :             {
   16559           28 :               half = gen_reg_rtx (mode0);
   16560           28 :               if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16561           12 :                 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16562           28 :               emit_insn (gen (half, op3));
   16563           28 :               op3 = half;
   16564              :             }
   16565              :           break;
   16566              :         default:
   16567              :           break;
   16568              :         }
   16569              : 
   16570              :       /* Force memory operand only with base register here.  But we
   16571              :          don't want to do it on memory operand for other builtin
   16572              :          functions.  */
   16573         1004 :       op1 = ix86_zero_extend_to_Pmode (op1);
   16574              : 
   16575         1004 :       if (!insn_data[icode].operand[1].predicate (op0, mode0))
   16576          403 :         op0 = copy_to_mode_reg (mode0, op0);
   16577         1009 :       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
   16578            0 :         op1 = copy_to_mode_reg (Pmode, op1);
   16579         1004 :       if (!insn_data[icode].operand[3].predicate (op2, mode2))
   16580          221 :         op2 = copy_to_mode_reg (mode2, op2);
   16581              : 
   16582         1004 :       op3 = fixup_modeless_constant (op3, mode3);
   16583              : 
   16584         1004 :       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
   16585              :         {
   16586         1004 :           if (!insn_data[icode].operand[4].predicate (op3, mode3))
   16587          356 :             op3 = copy_to_mode_reg (mode3, op3);
   16588              :         }
   16589              :       else
   16590              :         {
   16591            0 :           op3 = copy_to_reg (op3);
   16592            0 :           op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
   16593              :         }
   16594         1004 :       if (!insn_data[icode].operand[5].predicate (op4, mode4))
   16595              :         {
   16596            0 :           error ("the last argument must be scale 1, 2, 4, 8");
   16597            0 :           return const0_rtx;
   16598              :         }
   16599              : 
   16600              :       /* Optimize.  If mask is known to have all high bits set,
   16601              :          replace op0 with pc_rtx to signal that the instruction
   16602              :          overwrites the whole destination and doesn't use its
   16603              :          previous contents.  */
   16604         1004 :       if (optimize)
   16605              :         {
   16606          914 :           if (TREE_CODE (arg3) == INTEGER_CST)
   16607              :             {
   16608          209 :               if (integer_all_onesp (arg3))
   16609          201 :                 op0 = pc_rtx;
   16610              :             }
   16611          705 :           else if (TREE_CODE (arg3) == VECTOR_CST)
   16612              :             {
   16613              :               unsigned int negative = 0;
   16614          755 :               for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
   16615              :                 {
   16616          620 :                   tree cst = VECTOR_CST_ELT (arg3, i);
   16617          620 :                   if (TREE_CODE (cst) == INTEGER_CST
   16618          620 :                       && tree_int_cst_sign_bit (cst))
   16619          286 :                     negative++;
   16620          334 :                   else if (TREE_CODE (cst) == REAL_CST
   16621          334 :                            && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
   16622          306 :                     negative++;
   16623              :                 }
   16624          135 :               if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
   16625          121 :                 op0 = pc_rtx;
   16626              :             }
   16627          570 :           else if (TREE_CODE (arg3) == SSA_NAME
   16628          570 :                    && VECTOR_TYPE_P (TREE_TYPE (arg3)))
   16629              :             {
   16630              :               /* Recognize also when mask is like:
   16631              :                  __v2df src = _mm_setzero_pd ();
   16632              :                  __v2df mask = _mm_cmpeq_pd (src, src);
   16633              :                  or
   16634              :                  __v8sf src = _mm256_setzero_ps ();
   16635              :                  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
   16636              :                  as that is a cheaper way to load all ones into
   16637              :                  a register than having to load a constant from
   16638              :                  memory.  */
   16639          259 :               gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
   16640          259 :               if (is_gimple_call (def_stmt))
   16641              :                 {
   16642           76 :                   tree fndecl = gimple_call_fndecl (def_stmt);
   16643           76 :                   if (fndecl
   16644           76 :                       && fndecl_built_in_p (fndecl, BUILT_IN_MD))
   16645           67 :                     switch (DECL_MD_FUNCTION_CODE (fndecl))
   16646              :                       {
   16647           24 :                       case IX86_BUILTIN_CMPPD:
   16648           24 :                       case IX86_BUILTIN_CMPPS:
   16649           24 :                       case IX86_BUILTIN_CMPPD256:
   16650           24 :                       case IX86_BUILTIN_CMPPS256:
   16651           24 :                         if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
   16652              :                           break;
   16653              :                         /* FALLTHRU */
   16654           49 :                       case IX86_BUILTIN_CMPEQPD:
   16655           49 :                       case IX86_BUILTIN_CMPEQPS:
   16656           49 :                         if (initializer_zerop (gimple_call_arg (def_stmt, 0))
   16657           49 :                             && initializer_zerop (gimple_call_arg (def_stmt,
   16658              :                                                                    1)))
   16659           49 :                           op0 = pc_rtx;
   16660              :                         break;
   16661              :                       default:
   16662              :                         break;
   16663              :                       }
   16664              :                 }
   16665              :             }
   16666              :         }
   16667              : 
   16668         1004 :       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
   16669         1004 :       if (! pat)
   16670            0 :         return const0_rtx;
   16671         1004 :       emit_insn (pat);
   16672              : 
   16673         1004 :       switch (fcode)
   16674              :         {
   16675           24 :         case IX86_BUILTIN_GATHER3DIV16SF:
   16676           24 :           if (target == NULL_RTX)
   16677            0 :             target = gen_reg_rtx (V8SFmode);
   16678           24 :           emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
   16679           24 :           break;
   16680           24 :         case IX86_BUILTIN_GATHER3DIV16SI:
   16681           24 :           if (target == NULL_RTX)
   16682            0 :             target = gen_reg_rtx (V8SImode);
   16683           24 :           emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
   16684           24 :           break;
   16685           28 :         case IX86_BUILTIN_GATHER3DIV8SF:
   16686           28 :         case IX86_BUILTIN_GATHERDIV8SF:
   16687           28 :           if (target == NULL_RTX)
   16688            0 :             target = gen_reg_rtx (V4SFmode);
   16689           28 :           emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
   16690           28 :           break;
   16691           28 :         case IX86_BUILTIN_GATHER3DIV8SI:
   16692           28 :         case IX86_BUILTIN_GATHERDIV8SI:
   16693           28 :           if (target == NULL_RTX)
   16694            0 :             target = gen_reg_rtx (V4SImode);
   16695           28 :           emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
   16696           28 :           break;
   16697              :         default:
   16698              :           target = subtarget;
   16699              :           break;
   16700              :         }
   16701              :       return target;
   16702              : 
   16703          623 :     scatter_gen:
   16704          623 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16705          623 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16706          623 :       arg2 = CALL_EXPR_ARG (exp, 2);
   16707          623 :       arg3 = CALL_EXPR_ARG (exp, 3);
   16708          623 :       arg4 = CALL_EXPR_ARG (exp, 4);
   16709          623 :       op0 = expand_normal (arg0);
   16710          623 :       op1 = ix86_expand_unsigned_small_int_cst_argument (arg1);
   16711          623 :       op2 = expand_normal (arg2);
   16712          623 :       op3 = expand_normal (arg3);
   16713          623 :       op4 = expand_normal (arg4);
   16714          623 :       mode1 = insn_data[icode].operand[1].mode;
   16715          623 :       mode2 = insn_data[icode].operand[2].mode;
   16716          623 :       mode3 = insn_data[icode].operand[3].mode;
   16717          623 :       mode4 = insn_data[icode].operand[4].mode;
   16718              : 
   16719              :       /* Scatter instruction stores operand op3 to memory with
   16720              :          indices from op2 and scale from op4 under writemask op1.
   16721              :          If index operand op2 has more elements then source operand
   16722              :          op3 one need to use only its low half. And vice versa.  */
   16723          623 :       switch (fcode)
   16724              :         {
   16725           24 :         case IX86_BUILTIN_SCATTERALTSIV8DF:
   16726           24 :         case IX86_BUILTIN_SCATTERALTSIV8DI:
   16727           24 :           half = gen_reg_rtx (V8SImode);
   16728           24 :           if (!nonimmediate_operand (op2, V16SImode))
   16729            0 :             op2 = copy_to_mode_reg (V16SImode, op2);
   16730           24 :           emit_insn (gen_vec_extract_lo_v16si (half, op2));
   16731           24 :           op2 = half;
   16732           24 :           break;
   16733           36 :         case IX86_BUILTIN_SCATTERALTDIV16SF:
   16734           36 :         case IX86_BUILTIN_SCATTERALTDIV16SI:
   16735           36 :           half = gen_reg_rtx (mode3);
   16736           36 :           if (mode3 == V8SFmode)
   16737              :             gen = gen_vec_extract_lo_v16sf;
   16738              :           else
   16739           24 :             gen = gen_vec_extract_lo_v16si;
   16740           36 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16741            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16742           36 :           emit_insn (gen (half, op3));
   16743           36 :           op3 = half;
   16744           36 :           break;
   16745            8 :         case IX86_BUILTIN_SCATTERALTSIV4DF:
   16746            8 :         case IX86_BUILTIN_SCATTERALTSIV4DI:
   16747            8 :           half = gen_reg_rtx (V4SImode);
   16748            8 :           if (!nonimmediate_operand (op2, V8SImode))
   16749            0 :             op2 = copy_to_mode_reg (V8SImode, op2);
   16750            8 :           emit_insn (gen_vec_extract_lo_v8si (half, op2));
   16751            8 :           op2 = half;
   16752            8 :           break;
   16753            8 :         case IX86_BUILTIN_SCATTERALTDIV8SF:
   16754            8 :         case IX86_BUILTIN_SCATTERALTDIV8SI:
   16755            8 :           half = gen_reg_rtx (mode3);
   16756            8 :           if (mode3 == V4SFmode)
   16757              :             gen = gen_vec_extract_lo_v8sf;
   16758              :           else
   16759            4 :             gen = gen_vec_extract_lo_v8si;
   16760            8 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16761            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16762            8 :           emit_insn (gen (half, op3));
   16763            8 :           op3 = half;
   16764            8 :           break;
   16765           16 :         case IX86_BUILTIN_SCATTERALTSIV2DF:
   16766           16 :         case IX86_BUILTIN_SCATTERALTSIV2DI:
   16767           16 :           if (!nonimmediate_operand (op2, V4SImode))
   16768            0 :             op2 = copy_to_mode_reg (V4SImode, op2);
   16769              :           break;
   16770           16 :         case IX86_BUILTIN_SCATTERALTDIV4SF:
   16771           16 :         case IX86_BUILTIN_SCATTERALTDIV4SI:
   16772           16 :           if (!nonimmediate_operand (op3, GET_MODE (op3)))
   16773            0 :             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
   16774              :           break;
   16775              :         default:
   16776              :           break;
   16777              :         }
   16778              : 
   16779              :       /* Force memory operand only with base register here.  But we
   16780              :          don't want to do it on memory operand for other builtin
   16781              :          functions.  */
   16782          633 :       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
   16783              : 
   16784          628 :       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   16785            0 :         op0 = copy_to_mode_reg (Pmode, op0);
   16786              : 
   16787          623 :       op1 = fixup_modeless_constant (op1, mode1);
   16788              : 
   16789          623 :       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
   16790              :         {
   16791          607 :           if (!insn_data[icode].operand[1].predicate (op1, mode1))
   16792          273 :             op1 = copy_to_mode_reg (mode1, op1);
   16793              :         }
   16794              :       else
   16795              :         {
   16796           16 :           op1 = copy_to_reg (op1);
   16797           16 :           op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
   16798              :         }
   16799              : 
   16800          623 :       if (!insn_data[icode].operand[2].predicate (op2, mode2))
   16801           57 :         op2 = copy_to_mode_reg (mode2, op2);
   16802              : 
   16803          623 :       if (!insn_data[icode].operand[3].predicate (op3, mode3))
   16804           82 :         op3 = copy_to_mode_reg (mode3, op3);
   16805              : 
   16806          623 :       if (!insn_data[icode].operand[4].predicate (op4, mode4))
   16807              :         {
   16808            0 :           error ("the last argument must be scale 1, 2, 4, 8");
   16809            0 :           return const0_rtx;
   16810              :         }
   16811              : 
   16812          623 :       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
   16813          623 :       if (! pat)
   16814            0 :         return const0_rtx;
   16815              : 
   16816          623 :       emit_insn (pat);
   16817          623 :       return 0;
   16818              : 
   16819           23 :     case IX86_BUILTIN_XABORT:
   16820           23 :       icode = CODE_FOR_xabort;
   16821           23 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16822           23 :       op0 = expand_normal (arg0);
   16823           23 :       mode0 = insn_data[icode].operand[0].mode;
   16824           23 :       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   16825              :         {
   16826            0 :           error ("the argument to %<xabort%> intrinsic must "
   16827              :                  "be an 8-bit immediate");
   16828            0 :           return const0_rtx;
   16829              :         }
   16830           23 :       emit_insn (gen_xabort (op0));
   16831           23 :       return 0;
   16832              : 
   16833           55 :     case IX86_BUILTIN_RDSSPD:
   16834           55 :     case IX86_BUILTIN_RDSSPQ:
   16835           55 :       mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
   16836              : 
   16837           55 :       if (target == 0
   16838           55 :           || !register_operand (target, mode))
   16839            0 :         target = gen_reg_rtx (mode);
   16840              : 
   16841           55 :       op0 = force_reg (mode, const0_rtx);
   16842              : 
   16843           55 :       emit_insn (gen_rdssp (mode, target, op0));
   16844           55 :       return target;
   16845              : 
   16846           55 :     case IX86_BUILTIN_INCSSPD:
   16847           55 :     case IX86_BUILTIN_INCSSPQ:
   16848           55 :       mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
   16849              : 
   16850           55 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16851           55 :       op0 = expand_normal (arg0);
   16852              : 
   16853           55 :       op0 = force_reg (mode, op0);
   16854              : 
   16855           55 :       emit_insn (gen_incssp (mode, op0));
   16856           55 :       return 0;
   16857              : 
   16858           20 :     case IX86_BUILTIN_HRESET:
   16859           20 :       icode = CODE_FOR_hreset;
   16860           20 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16861           20 :       op0 = expand_normal (arg0);
   16862           20 :       op0 = force_reg (SImode, op0);
   16863           20 :       emit_insn (gen_hreset (op0));
   16864           20 :       return 0;
   16865              : 
   16866           38 :     case IX86_BUILTIN_RSTORSSP:
   16867           38 :     case IX86_BUILTIN_CLRSSBSY:
   16868           38 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16869           38 :       op0 = expand_normal (arg0);
   16870           19 :       icode = (fcode == IX86_BUILTIN_RSTORSSP
   16871           38 :                ? CODE_FOR_rstorssp
   16872              :                : CODE_FOR_clrssbsy);
   16873              : 
   16874           38 :       if (!address_operand (op0, VOIDmode))
   16875              :         {
   16876           18 :           op0 = convert_memory_address (Pmode, op0);
   16877           18 :           op0 = copy_addr_to_reg (op0);
   16878              :         }
   16879           38 :       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
   16880           38 :       return 0;
   16881              : 
   16882           76 :     case IX86_BUILTIN_WRSSD:
   16883           76 :     case IX86_BUILTIN_WRSSQ:
   16884           76 :     case IX86_BUILTIN_WRUSSD:
   16885           76 :     case IX86_BUILTIN_WRUSSQ:
   16886           76 :       mode = ((fcode == IX86_BUILTIN_WRSSD
   16887           76 :                || fcode == IX86_BUILTIN_WRUSSD)
   16888           76 :               ? SImode : DImode);
   16889              : 
   16890           76 :       arg0 = CALL_EXPR_ARG (exp, 0);
   16891           76 :       op0 = expand_normal (arg0);
   16892           76 :       arg1 = CALL_EXPR_ARG (exp, 1);
   16893           76 :       op1 = expand_normal (arg1);
   16894              : 
   16895           76 :       op0 = force_reg (mode, op0);
   16896              : 
   16897           76 :       if (!address_operand (op1, VOIDmode))
   16898              :         {
   16899           36 :           op1 = convert_memory_address (Pmode, op1);
   16900           36 :           op1 = copy_addr_to_reg (op1);
   16901              :         }
   16902           76 :       op1 = gen_rtx_MEM (mode, op1);
   16903              : 
   16904           76 :       icode = ((fcode == IX86_BUILTIN_WRSSD
   16905           76 :                 || fcode == IX86_BUILTIN_WRSSQ)
   16906           76 :                ? code_for_wrss (mode)
   16907           38 :                : code_for_wruss (mode));
   16908           76 :       emit_insn (GEN_FCN (icode) (op0, op1));
   16909              : 
   16910           76 :       return 0;
   16911              : 
   16912       114949 :     default:
   16913       114949 :       break;
   16914              :     }
   16915              : 
   16916       114949 :   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
   16917       114949 :       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
   16918              :     {
   16919        27043 :       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
   16920        27043 :       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
   16921        27043 :                                                target);
   16922              :     }
   16923              : 
   16924        87906 :   if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
   16925        87906 :       && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
   16926              :     {
   16927           93 :       i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
   16928           93 :       return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
   16929           93 :                                                target);
   16930              :     }
   16931              : 
   16932        87813 :   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
   16933        87813 :       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
   16934              :     {
   16935        69465 :       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
   16936              : 
   16937        69465 :       switch (fcode)
   16938              :         {
   16939            0 :           case IX86_BUILTIN_RDPID:
   16940            0 :             return ix86_expand_special_args_builtin (bdesc_args + i, exp,
   16941            0 :                                                      target);
   16942           72 :           case IX86_BUILTIN_VCOMISBF16EQ:
   16943           72 :           case IX86_BUILTIN_VCOMISBF16NE:
   16944           72 :           case IX86_BUILTIN_VCOMISBF16GT:
   16945           72 :           case IX86_BUILTIN_VCOMISBF16GE:
   16946           72 :           case IX86_BUILTIN_VCOMISBF16LT:
   16947           72 :           case IX86_BUILTIN_VCOMISBF16LE:
   16948           72 :             return ix86_expand_sse_comi (bdesc_args + i, exp, target, false);
   16949           15 :           case IX86_BUILTIN_FABSQ:
   16950           15 :           case IX86_BUILTIN_COPYSIGNQ:
   16951           15 :             if (!TARGET_SSE)
   16952              :               /* Emit a normal call if SSE isn't available.  */
   16953            0 :               return expand_call (exp, target, ignore);
   16954              :             /* FALLTHRU */
   16955        69393 :           default:
   16956        69393 :             return ix86_expand_args_builtin (bdesc_args + i, exp, target);
   16957              :           }
   16958              :     }
   16959              : 
   16960        18348 :   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
   16961        18348 :       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
   16962              :     {
   16963          473 :       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
   16964          473 :       return ix86_expand_sse_comi (bdesc_comi + i, exp, target, true);
   16965              :     }
   16966              : 
   16967        17875 :   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
   16968        17875 :       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
   16969              :     {
   16970        15554 :       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
   16971        15554 :       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
   16972              :     }
   16973              : 
   16974         2321 :   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
   16975         2321 :       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
   16976              :     {
   16977          216 :       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
   16978          216 :       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
   16979              :     }
   16980              : 
   16981         2105 :   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
   16982         2105 :       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
   16983              :     {
   16984          275 :       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
   16985          275 :       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
   16986              :     }
   16987              : 
   16988         1830 :   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
   16989         1830 :       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
   16990              :     {
   16991         1792 :       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
   16992         1792 :       const struct builtin_description *d = bdesc_multi_arg + i;
   16993         1792 :       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
   16994              :                                             (enum ix86_builtin_func_type)
   16995         1792 :                                             d->flag, d->comparison);
   16996              :     }
   16997              : 
   16998           38 :   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
   16999           38 :       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
   17000              :     {
   17001           38 :       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
   17002           38 :       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
   17003           38 :                                                target);
   17004              :     }
   17005              : 
   17006            0 :   gcc_unreachable ();
   17007              : }
   17008              : 
/* See below where shifts are handled for explanation of this enum.
   Each value names the strategy ix86_vector_duplicate_simode_const uses
   to materialize a broadcast SImode constant without a constant-pool
   load; all of them start from a zero or all-ones register.  */
enum ix86_vec_bcast_alg
{
  VEC_BCAST_PXOR,	/* All-zero vector: move of CONST0.  */
  VEC_BCAST_PCMPEQ,	/* All-ones vector: move of CONSTM1.  */
  VEC_BCAST_PABSB,	/* 0x01010101: byte-wise abs of all-ones.  */
  VEC_BCAST_PADDB,	/* 0xfefefefe: byte-wise all-ones plus itself.  */
  VEC_BCAST_PSRLW,	/* Word low-mask: logical right shift of ones.  */
  VEC_BCAST_PSRLD,	/* Dword low-mask: logical right shift of ones.  */
  VEC_BCAST_PSLLW,	/* Word high-mask: left shift of all-ones.  */
  VEC_BCAST_PSLLD	/* Dword high-mask: left shift of all-ones.  */
};
   17021              : 
/* One entry of the SImode broadcast-constant lookup table: maps a
   32-bit constant KEY to the algorithm (and, for the shift-based
   algorithms, the shift count) that synthesizes it.  */
struct ix86_vec_bcast_map_simode_t
{
  unsigned int key;		/* The SImode constant to broadcast.  */
  enum ix86_vec_bcast_alg alg;	/* How to synthesize it.  */
  unsigned int arg;		/* Shift count for the shift algorithms,
				   0 for the others.  */
};
   17028              : 
/* This table must be kept sorted as values are looked-up using bsearch.
   It enumerates every SImode constant that can be broadcast cheaply:
   all-zeros, all-ones, the PABSB/PADDB byte patterns, and all contiguous
   low/high bit masks per 16-bit word or 32-bit dword.  Consumed by
   ix86_vector_duplicate_simode_const.  */
static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = {
  { 0x00000000, VEC_BCAST_PXOR,    0 },
  { 0x00000001, VEC_BCAST_PSRLD,  31 },
  { 0x00000003, VEC_BCAST_PSRLD,  30 },
  { 0x00000007, VEC_BCAST_PSRLD,  29 },
  { 0x0000000f, VEC_BCAST_PSRLD,  28 },
  { 0x0000001f, VEC_BCAST_PSRLD,  27 },
  { 0x0000003f, VEC_BCAST_PSRLD,  26 },
  { 0x0000007f, VEC_BCAST_PSRLD,  25 },
  { 0x000000ff, VEC_BCAST_PSRLD,  24 },
  { 0x000001ff, VEC_BCAST_PSRLD,  23 },
  { 0x000003ff, VEC_BCAST_PSRLD,  22 },
  { 0x000007ff, VEC_BCAST_PSRLD,  21 },
  { 0x00000fff, VEC_BCAST_PSRLD,  20 },
  { 0x00001fff, VEC_BCAST_PSRLD,  19 },
  { 0x00003fff, VEC_BCAST_PSRLD,  18 },
  { 0x00007fff, VEC_BCAST_PSRLD,  17 },
  { 0x0000ffff, VEC_BCAST_PSRLD,  16 },
  { 0x00010001, VEC_BCAST_PSRLW,  15 },
  { 0x0001ffff, VEC_BCAST_PSRLD,  15 },
  { 0x00030003, VEC_BCAST_PSRLW,  14 },
  { 0x0003ffff, VEC_BCAST_PSRLD,  14 },
  { 0x00070007, VEC_BCAST_PSRLW,  13 },
  { 0x0007ffff, VEC_BCAST_PSRLD,  13 },
  { 0x000f000f, VEC_BCAST_PSRLW,  12 },
  { 0x000fffff, VEC_BCAST_PSRLD,  12 },
  { 0x001f001f, VEC_BCAST_PSRLW,  11 },
  { 0x001fffff, VEC_BCAST_PSRLD,  11 },
  { 0x003f003f, VEC_BCAST_PSRLW,  10 },
  { 0x003fffff, VEC_BCAST_PSRLD,  10 },
  { 0x007f007f, VEC_BCAST_PSRLW,   9 },
  { 0x007fffff, VEC_BCAST_PSRLD,   9 },
  { 0x00ff00ff, VEC_BCAST_PSRLW,   8 },
  { 0x00ffffff, VEC_BCAST_PSRLD,   8 },
  { 0x01010101, VEC_BCAST_PABSB,   0 },
  { 0x01ff01ff, VEC_BCAST_PSRLW,   7 },
  { 0x01ffffff, VEC_BCAST_PSRLD,   7 },
  { 0x03ff03ff, VEC_BCAST_PSRLW,   6 },
  { 0x03ffffff, VEC_BCAST_PSRLD,   6 },
  { 0x07ff07ff, VEC_BCAST_PSRLW,   5 },
  { 0x07ffffff, VEC_BCAST_PSRLD,   5 },
  { 0x0fff0fff, VEC_BCAST_PSRLW,   4 },
  { 0x0fffffff, VEC_BCAST_PSRLD,   4 },
  { 0x1fff1fff, VEC_BCAST_PSRLW,   3 },
  { 0x1fffffff, VEC_BCAST_PSRLD,   3 },
  { 0x3fff3fff, VEC_BCAST_PSRLW,   2 },
  { 0x3fffffff, VEC_BCAST_PSRLD,   2 },
  { 0x7fff7fff, VEC_BCAST_PSRLW,   1 },
  { 0x7fffffff, VEC_BCAST_PSRLD,   1 },
  { 0x80000000, VEC_BCAST_PSLLD,  31 },
  { 0x80008000, VEC_BCAST_PSLLW,  15 },
  { 0xc0000000, VEC_BCAST_PSLLD,  30 },
  { 0xc000c000, VEC_BCAST_PSLLW,  14 },
  { 0xe0000000, VEC_BCAST_PSLLD,  29 },
  { 0xe000e000, VEC_BCAST_PSLLW,  13 },
  { 0xf0000000, VEC_BCAST_PSLLD,  28 },
  { 0xf000f000, VEC_BCAST_PSLLW,  12 },
  { 0xf8000000, VEC_BCAST_PSLLD,  27 },
  { 0xf800f800, VEC_BCAST_PSLLW,  11 },
  { 0xfc000000, VEC_BCAST_PSLLD,  26 },
  { 0xfc00fc00, VEC_BCAST_PSLLW,  10 },
  { 0xfe000000, VEC_BCAST_PSLLD,  25 },
  { 0xfe00fe00, VEC_BCAST_PSLLW,   9 },
  { 0xfefefefe, VEC_BCAST_PADDB,   0 },
  { 0xff000000, VEC_BCAST_PSLLD,  24 },
  { 0xff00ff00, VEC_BCAST_PSLLW,   8 },
  { 0xff800000, VEC_BCAST_PSLLD,  23 },
  { 0xff80ff80, VEC_BCAST_PSLLW,   7 },
  { 0xffc00000, VEC_BCAST_PSLLD,  22 },
  { 0xffc0ffc0, VEC_BCAST_PSLLW,   6 },
  { 0xffe00000, VEC_BCAST_PSLLD,  21 },
  { 0xffe0ffe0, VEC_BCAST_PSLLW,   5 },
  { 0xfff00000, VEC_BCAST_PSLLD,  20 },
  { 0xfff0fff0, VEC_BCAST_PSLLW,   4 },
  { 0xfff80000, VEC_BCAST_PSLLD,  19 },
  { 0xfff8fff8, VEC_BCAST_PSLLW,   3 },
  { 0xfffc0000, VEC_BCAST_PSLLD,  18 },
  { 0xfffcfffc, VEC_BCAST_PSLLW,   2 },
  { 0xfffe0000, VEC_BCAST_PSLLD,  17 },
  { 0xfffefffe, VEC_BCAST_PSLLW,   1 },
  { 0xffff0000, VEC_BCAST_PSLLD,  16 },
  { 0xffff8000, VEC_BCAST_PSLLD,  15 },
  { 0xffffc000, VEC_BCAST_PSLLD,  14 },
  { 0xffffe000, VEC_BCAST_PSLLD,  13 },
  { 0xfffff000, VEC_BCAST_PSLLD,  12 },
  { 0xfffff800, VEC_BCAST_PSLLD,  11 },
  { 0xfffffc00, VEC_BCAST_PSLLD,  10 },
  { 0xfffffe00, VEC_BCAST_PSLLD,   9 },
  { 0xffffff00, VEC_BCAST_PSLLD,   8 },
  { 0xffffff80, VEC_BCAST_PSLLD,   7 },
  { 0xffffffc0, VEC_BCAST_PSLLD,   6 },
  { 0xffffffe0, VEC_BCAST_PSLLD,   5 },
  { 0xfffffff0, VEC_BCAST_PSLLD,   4 },
  { 0xfffffff8, VEC_BCAST_PSLLD,   3 },
  { 0xfffffffc, VEC_BCAST_PSLLD,   2 },
  { 0xfffffffe, VEC_BCAST_PSLLD,   1 },
  { 0xffffffff, VEC_BCAST_PCMPEQ,  0 }
};
   17128              : 
   17129              : /* Comparator for bsearch on ix86_vec_bcast_map.  */
   17130              : static int
   17131       288296 : ix86_vec_bcast_map_simode_cmp (const void *key, const void *entry)
   17132              : {
   17133       288296 :   return (*(const unsigned int*)key)
   17134       288296 :          - ((const ix86_vec_bcast_map_simode_t*)entry)->key;
   17135              : }
   17136              : 
/* A subroutine of ix86_vector_duplicate_value.  Tries to efficiently
   materialize V4SImode, V8SImode and V16SImode vectors from SImode
   integer constants.  Looks VAL up in ix86_vec_bcast_map_simode and,
   when found, emits into TARGET a short sequence (zero/all-ones move,
   byte abs/add, or a logical shift of an all-ones register) instead of
   a constant-pool broadcast.  Returns true on success, false when VAL
   is not in the table or the ISA extension required for MODE is not
   enabled.  */
static bool
ix86_vector_duplicate_simode_const (machine_mode mode, rtx target,
                                    unsigned int val)
{
  const ix86_vec_bcast_map_simode_t *entry;
  rtx tmp1, tmp2;

  /* The table is sorted by key, so bsearch applies.  */
  entry = (const ix86_vec_bcast_map_simode_t*)
          bsearch(&val, ix86_vec_bcast_map_simode,
                  ARRAY_SIZE (ix86_vec_bcast_map_simode),
                  sizeof (ix86_vec_bcast_map_simode_t),
                  ix86_vec_bcast_map_simode_cmp);
  if (!entry)
    return false;

  switch (entry->alg)
    {
    /* All-zero vector: a plain move of CONST0.  */
    case VEC_BCAST_PXOR:
      if ((mode == V8SImode && !TARGET_AVX2)
          || (mode == V16SImode && !TARGET_AVX512F))
        return false;
      emit_move_insn (target, CONST0_RTX (mode));
      return true;

    /* All-ones vector: a plain move of CONSTM1.  */
    case VEC_BCAST_PCMPEQ:
      if ((mode == V4SImode && !TARGET_SSE2)
          || (mode == V8SImode && !TARGET_AVX2)
          || (mode == V16SImode && !TARGET_AVX512F))
        return false;
      emit_move_insn (target, CONSTM1_RTX (mode));
      return true;

    /* 0x01010101: byte-wise abs of all-ones (abs (-1) == 1 per byte).
       The QImode-vector result is moved into TARGET via the common
       lowpart move after the switch.  */
    case VEC_BCAST_PABSB:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V16QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
          tmp2 = gen_reg_rtx (V16QImode);
          emit_insn (gen_absv16qi2 (tmp2, tmp1));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V32QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
          tmp2 = gen_reg_rtx (V32QImode);
          emit_insn (gen_absv32qi2 (tmp2, tmp1));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V64QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
          tmp2 = gen_reg_rtx (V64QImode);
          emit_insn (gen_absv64qi2 (tmp2, tmp1));
        }
      else
        return false;
      break;

    /* 0xfefefefe: byte-wise -1 + -1 == 0xfe per byte.  */
    case VEC_BCAST_PADDB:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V16QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
          tmp2 = gen_reg_rtx (V16QImode);
          emit_insn (gen_addv16qi3 (tmp2, tmp1, tmp1));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V32QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
          tmp2 = gen_reg_rtx (V32QImode);
          emit_insn (gen_addv32qi3 (tmp2, tmp1, tmp1));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V64QImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
          tmp2 = gen_reg_rtx (V64QImode);
          emit_insn (gen_addv64qi3 (tmp2, tmp1, tmp1));
        }
      else
        return false;
      break;

    /* Per-word low mask: logical right shift of all-ones HImode
       elements by entry->arg bits.  */
    case VEC_BCAST_PSRLW:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V8HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
          tmp2 = gen_reg_rtx (V8HImode);
          emit_insn (gen_lshrv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V16HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
          tmp2 = gen_reg_rtx (V16HImode);
          emit_insn (gen_lshrv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V32HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
          tmp2 = gen_reg_rtx (V32HImode);
          emit_insn (gen_lshrv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else
        return false;
      break;

    /* Per-dword low mask: shift is already in MODE, so the result can
       go straight into TARGET with no lowpart move.  */
    case VEC_BCAST_PSRLD:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V4SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
          emit_insn (gen_lshrv4si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V8SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
          emit_insn (gen_lshrv8si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V16SImode && TARGET_AVX512F)
        {
          tmp1 = gen_reg_rtx (V16SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
          emit_insn (gen_lshrv16si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else
        return false;
      break;

    /* Per-word high mask: left shift of all-ones HImode elements.  */
    case VEC_BCAST_PSLLW:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V8HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
          tmp2 = gen_reg_rtx (V8HImode);
          emit_insn (gen_ashlv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V16HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
          tmp2 = gen_reg_rtx (V16HImode);
          emit_insn (gen_ashlv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else if (mode == V16SImode && TARGET_AVX512BW)
        {
          tmp1 = gen_reg_rtx (V32HImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
          tmp2 = gen_reg_rtx (V32HImode);
          emit_insn (gen_ashlv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
        }
      else
        return false;
      break;

    /* Per-dword high mask: as for PSRLD the result lands directly in
       TARGET; every branch here returns, so nothing falls through to
       the default.  */
    case VEC_BCAST_PSLLD:
      if (mode == V4SImode && TARGET_SSE2)
        {
          tmp1 = gen_reg_rtx (V4SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
          emit_insn (gen_ashlv4si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V8SImode && TARGET_AVX2)
        {
          tmp1 = gen_reg_rtx (V8SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
          emit_insn (gen_ashlv8si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else if (mode == V16SImode && TARGET_AVX512F)
        {
          tmp1 = gen_reg_rtx (V16SImode);
          emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
          emit_insn (gen_ashlv16si3 (target, tmp1, GEN_INT (entry->arg)));
          return true;
        }
      else
        return false;

    default:
      return false;
    }

  /* The PABSB/PADDB/PSRLW/PSLLW cases computed the value in tmp2 in a
     narrower-element mode; reinterpret it as MODE.  */
  emit_move_insn (target, gen_lowpart (mode, tmp2));
  return true;
}
   17334              : 
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to fill
   TARGET (of vector mode MODE) with VAL via vec_duplicate.  Always
   returns true; the cheap-constant path may fail, but the generic
   vec_duplicate path below asserts that it recognizes.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* For SImode-element vectors with a constant VAL, first try the
     table-driven synthesis that avoids a constant-pool load.  */
  if ((mode == V4SImode || mode == V8SImode || mode == V16SImode)
      && CONST_INT_P (val)
      && ix86_vector_duplicate_simode_const (mode, target, INTVAL (val)))
    return true;

  /* Save/restore recog_data in case this is called from splitters
     or other routines where recog_data needs to stay valid across
     force_reg.  See PR106577.  */
  recog_data_d recog_data_save = recog_data;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register or mem.  */

      start_sequence ();

      /* For a 128-bit-or-wider broadcast of a small integer constant,
	 prefer broadcasting from memory unless the tuning says to
	 broadcast from an integer register.  */
      if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
          && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
          && GET_MODE_BITSIZE(mode) >= 128)
        reg = validize_mem (force_const_mem (innermode, val));
      else
        {
          reg = force_reg (innermode, val);
          if (GET_MODE (reg) != innermode)
            reg = gen_lowpart (innermode, reg);
        }

      /* Rewrite the already-emitted insn in place to duplicate REG,
	 and emit the sequence that produced REG ahead of it.  */
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = end_sequence ();
      if (seq)
        emit_insn_before (seq, insn);

      /* The register/memory form must be recognizable.  */
      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  recog_data = recog_data_save;
  return true;
}
   17390              : 
   17391              : /* Get a vector mode of the same size as the original but with elements
   17392              :    twice as wide.  This is only guaranteed to apply to integral vectors.  */
   17393              : 
   17394              : static machine_mode
   17395        18217 : get_mode_wider_vector (machine_mode o)
   17396              : {
   17397              :   /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
   17398        18217 :   machine_mode n = GET_MODE_NEXT_MODE (o).require ();
   17399        54651 :   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
   17400        54651 :   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
   17401        18217 :   return n;
   17402              : }
   17403              : 
   17404              : static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
   17405              : static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
   17406              : 
   17407              : /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   17408              :    with all elements equal to VAR.  Return true if successful.  */
   17409              : 
   17410              : bool
   17411       163384 : ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
   17412              :                                    rtx target, rtx val)
   17413              : {
   17414       163384 :   bool ok;
   17415              : 
   17416       163384 :   switch (mode)
   17417              :     {
   17418        69937 :     case E_V2DImode:
   17419        69937 :       if (CONST_INT_P (val))
   17420              :         {
   17421        61190 :           int tmp = (int)INTVAL (val);
   17422        61190 :           if (tmp == (int)(INTVAL (val) >> 32))
   17423              :             {
   17424          166 :               rtx reg = gen_reg_rtx (V4SImode);
   17425          166 :               ok = ix86_vector_duplicate_value (V4SImode, reg,
   17426              :                                                 GEN_INT (tmp));
   17427          166 :               if (ok)
   17428              :                 {
   17429          166 :                   emit_move_insn (target, gen_lowpart (V2DImode, reg));
   17430          166 :                   return true;
   17431              :                 }
   17432              :             }
   17433              :         }
   17434        69771 :       return ix86_vector_duplicate_value (mode, target, val);
   17435              : 
   17436         1092 :     case E_V4DImode:
   17437         1092 :       if (CONST_INT_P (val))
   17438              :         {
   17439          781 :           int tmp = (int)INTVAL (val);
   17440          781 :           if (tmp == (int)(INTVAL (val) >> 32))
   17441              :             {
   17442           54 :               rtx reg = gen_reg_rtx (V8SImode);
   17443           54 :               ok = ix86_vector_duplicate_value (V8SImode, reg,
   17444              :                                                 GEN_INT (tmp));
   17445           54 :               if (ok)
   17446              :                 {
   17447           54 :                   emit_move_insn (target, gen_lowpart (V4DImode, reg));
   17448           54 :                   return true;
   17449              :                 }
   17450              :             }
   17451              :         }
   17452         1038 :       return ix86_vector_duplicate_value (mode, target, val);
   17453              : 
   17454          513 :     case E_V8DImode:
   17455          513 :       if (CONST_INT_P (val))
   17456              :         {
   17457          264 :           int tmp = (int)INTVAL (val);
   17458          264 :           if (tmp == (int)(INTVAL (val) >> 32))
   17459              :             {
   17460           24 :               rtx reg = gen_reg_rtx (V16SImode);
   17461           24 :               ok = ix86_vector_duplicate_value (V16SImode, reg,
   17462              :                                                 GEN_INT (tmp));
   17463           24 :               if (ok)
   17464              :                 {
   17465           24 :                   emit_move_insn (target, gen_lowpart (V8DImode, reg));
   17466           24 :                   return true;
   17467              :                 }
   17468              :             }
   17469              :         }
   17470          489 :       return ix86_vector_duplicate_value (mode, target, val);
   17471              : 
   17472         2610 :     case E_V2SImode:
   17473         2610 :     case E_V2SFmode:
   17474         2610 :       if (!mmx_ok)
   17475              :         return false;
   17476              :       /* FALLTHRU */
   17477              : 
   17478        71319 :     case E_V4DFmode:
   17479        71319 :     case E_V8SFmode:
   17480        71319 :     case E_V8SImode:
   17481        71319 :     case E_V2DFmode:
   17482        71319 :     case E_V4SFmode:
   17483        71319 :     case E_V4SImode:
   17484        71319 :     case E_V16SImode:
   17485        71319 :     case E_V16SFmode:
   17486        71319 :     case E_V8DFmode:
   17487        71319 :       return ix86_vector_duplicate_value (mode, target, val);
   17488              : 
   17489          398 :     case E_V4HImode:
   17490          398 :       if (!mmx_ok)
   17491              :         return false;
   17492          395 :       if (TARGET_SSE || TARGET_3DNOW_A)
   17493              :         {
   17494          395 :           rtx x;
   17495              : 
   17496          395 :           val = gen_lowpart (SImode, val);
   17497          395 :           if (CONST_INT_P (val))
   17498              :             return false;
   17499          393 :           x = gen_rtx_TRUNCATE (HImode, val);
   17500          393 :           x = gen_rtx_VEC_DUPLICATE (mode, x);
   17501          393 :           emit_insn (gen_rtx_SET (target, x));
   17502          393 :           return true;
   17503              :         }
   17504            0 :       goto widen;
   17505              : 
   17506            5 :     case E_V4HFmode:
   17507            5 :     case E_V4BFmode:
   17508            5 :       if (TARGET_MMX_WITH_SSE)
   17509              :         {
   17510           10 :           val = force_reg (GET_MODE_INNER (mode), val);
   17511            5 :           rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
   17512            5 :           emit_insn (gen_rtx_SET (target, x));
   17513            5 :           return true;
   17514              :         }
   17515              :       return false;
   17516              : 
   17517          108 :     case E_V2HImode:
   17518          108 :       if (TARGET_SSE2)
   17519              :         {
   17520          108 :           rtx x;
   17521              : 
   17522          108 :           val = gen_lowpart (SImode, val);
   17523          108 :           if (CONST_INT_P (val))
   17524              :             return false;
   17525          108 :           x = gen_rtx_TRUNCATE (HImode, val);
   17526          108 :           x = gen_rtx_VEC_DUPLICATE (mode, x);
   17527          108 :           emit_insn (gen_rtx_SET (target, x));
   17528          108 :           return true;
   17529              :         }
   17530              :       return false;
   17531              : 
   17532            3 :     case E_V2HFmode:
   17533            3 :     case E_V2BFmode:
   17534            3 :       if (TARGET_SSE2)
   17535              :         {
   17536            6 :           val = force_reg (GET_MODE_INNER (mode), val);
   17537            3 :           rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
   17538            3 :           emit_insn (gen_rtx_SET (target, x));
   17539            3 :           return true;
   17540              :         }
   17541              :       return false;
   17542              : 
   17543          297 :     case E_V8QImode:
   17544          297 :     case E_V4QImode:
   17545          297 :       if (!mmx_ok)
   17546              :         return false;
   17547          293 :       goto widen;
   17548              : 
   17549         9827 :     case E_V8HImode:
   17550         9827 :       if (CONST_INT_P (val))
   17551         9326 :         goto widen;
   17552              :       /* FALLTHRU */
   17553              : 
   17554          815 :     case E_V8HFmode:
   17555          815 :     case E_V8BFmode:
   17556          815 :       if (TARGET_AVX2)
   17557          391 :         return ix86_vector_duplicate_value (mode, target, val);
   17558              : 
   17559          424 :       if (TARGET_SSE2)
   17560              :         {
   17561         1106 :           struct expand_vec_perm_d dperm;
   17562         1106 :           rtx tmp1, tmp2;
   17563              : 
   17564          424 :         permute:
   17565         1106 :           memset (&dperm, 0, sizeof (dperm));
   17566         1106 :           dperm.target = target;
   17567         1106 :           dperm.vmode = mode;
   17568         1106 :           dperm.nelt = GET_MODE_NUNITS (mode);
   17569         1106 :           dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
   17570         1106 :           dperm.one_operand_p = true;
   17571              : 
   17572         1106 :           if (mode == V8HFmode || mode == V8BFmode)
   17573              :             {
   17574            3 :               tmp1 = force_reg (GET_MODE_INNER (mode), val);
   17575            3 :               tmp2 = gen_reg_rtx (mode);
   17576            3 :               emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
   17577            3 :               tmp1 = gen_lowpart (mode, tmp2);
   17578              :             }
   17579              :           else
   17580              :             {
   17581              :               /* Extend to SImode using a paradoxical SUBREG.  */
   17582         1103 :               tmp1 = gen_reg_rtx (SImode);
   17583         1103 :               emit_move_insn (tmp1, gen_lowpart (SImode, val));
   17584              : 
   17585              :               /* Insert the SImode value as
   17586              :                  low element of a V4SImode vector.  */
   17587         1103 :               tmp2 = gen_reg_rtx (V4SImode);
   17588         1103 :               emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
   17589         1103 :               tmp1 = gen_lowpart (mode, tmp2);
   17590              :             }
   17591              : 
   17592         1106 :           emit_move_insn (dperm.op0, tmp1);
   17593         1106 :           ok = (expand_vec_perm_1 (&dperm)
   17594         1106 :                 || expand_vec_perm_broadcast_1 (&dperm));
   17595            0 :           gcc_assert (ok);
   17596         1106 :           return ok;
   17597              :         }
   17598            0 :       goto widen;
   17599              : 
   17600         5618 :     case E_V16QImode:
   17601         5618 :       if (CONST_INT_P (val))
   17602         4880 :         goto widen;
   17603          738 :       if (TARGET_AVX2)
   17604           56 :         return ix86_vector_duplicate_value (mode, target, val);
   17605              : 
   17606          682 :       if (TARGET_SSE2)
   17607          682 :         goto permute;
   17608            0 :       goto widen;
   17609              : 
   17610        16703 :     widen:
   17611              :       /* Replicate the value once into the next wider mode and recurse.  */
   17612        16703 :       {
   17613        16703 :         machine_mode smode, wsmode, wvmode;
   17614        16703 :         rtx x;
   17615              : 
   17616        16703 :         smode = GET_MODE_INNER (mode);
   17617        16703 :         wvmode = get_mode_wider_vector (mode);
   17618        16703 :         wsmode = GET_MODE_INNER (wvmode);
   17619              : 
   17620        16703 :         val = convert_modes (wsmode, smode, val, true);
   17621              : 
   17622        16703 :         if (CONST_INT_P (val))
   17623              :           {
   17624        32822 :             x = simplify_binary_operation (ASHIFT, wsmode, val,
   17625        16411 :                                            GEN_INT (GET_MODE_BITSIZE (smode)));
   17626        16411 :             val = simplify_binary_operation (IOR, wsmode, val, x);
   17627              :           }
   17628          292 :         else if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
   17629          292 :           emit_insn (gen_insv_1 (wsmode, val, val));
   17630              :         else
   17631              :           {
   17632            0 :             x = expand_simple_binop (wsmode, ASHIFT, val,
   17633            0 :                                      GEN_INT (GET_MODE_BITSIZE (smode)),
   17634              :                                      NULL_RTX, 1, OPTAB_LIB_WIDEN);
   17635            0 :             val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
   17636              :                                        OPTAB_LIB_WIDEN);
   17637              :           }
   17638              : 
   17639        16703 :         x = gen_reg_rtx (wvmode);
   17640        16703 :         ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
   17641        16703 :         if (!ok)
   17642              :           return false;
   17643        16702 :         emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
   17644        16702 :         return true;
   17645              :       }
   17646              : 
   17647         1435 :     case E_V16HImode:
   17648         1435 :     case E_V32QImode:
   17649         1435 :       if (CONST_INT_P (val))
   17650         1140 :         goto widen;
   17651              :       /* FALLTHRU */
   17652              : 
   17653          378 :     case E_V16HFmode:
   17654          378 :     case E_V16BFmode:
   17655          378 :       if (TARGET_AVX2)
   17656          350 :         return ix86_vector_duplicate_value (mode, target, val);
   17657              :       else
   17658              :         {
   17659           28 :           machine_mode hvmode;
   17660           28 :           switch (mode)
   17661              :             {
   17662              :             case V16HImode:
   17663              :               hvmode = V8HImode;
   17664              :               break;
   17665            0 :             case V16HFmode:
   17666            0 :               hvmode = V8HFmode;
   17667            0 :               break;
   17668            1 :             case V16BFmode:
   17669            1 :               hvmode = V8BFmode;
   17670            1 :               break;
   17671           14 :             case V32QImode:
   17672           14 :               hvmode = V16QImode;
   17673           14 :               break;
   17674            0 :             default:
   17675            0 :               gcc_unreachable ();
   17676              :             }
   17677           28 :           rtx x = gen_reg_rtx (hvmode);
   17678              : 
   17679           28 :           ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
   17680           28 :           if (!ok)
   17681              :             return false;
   17682              : 
   17683           28 :           x = gen_rtx_VEC_CONCAT (mode, x, x);
   17684           28 :           emit_insn (gen_rtx_SET (target, x));
   17685              :         }
   17686           28 :       return true;
   17687              : 
   17688         1194 :     case E_V32HImode:
   17689         1194 :     case E_V64QImode:
   17690         1194 :       if (CONST_INT_P (val))
   17691         1064 :         goto widen;
   17692              :       /* FALLTHRU */
   17693              : 
   17694          209 :     case E_V32HFmode:
   17695          209 :     case E_V32BFmode:
   17696          209 :       if (TARGET_AVX512BW)
   17697          189 :         return ix86_vector_duplicate_value (mode, target, val);
   17698              :       else
   17699              :         {
   17700           20 :           machine_mode hvmode;
   17701           20 :           switch (mode)
   17702              :             {
   17703              :             case V32HImode:
   17704              :               hvmode = V16HImode;
   17705              :               break;
   17706            0 :             case V32HFmode:
   17707            0 :               hvmode = V16HFmode;
   17708            0 :               break;
   17709            1 :             case V32BFmode:
   17710            1 :               hvmode = V16BFmode;
   17711            1 :               break;
   17712           10 :             case V64QImode:
   17713           10 :               hvmode = V32QImode;
   17714           10 :               break;
   17715            0 :             default:
   17716            0 :               gcc_unreachable ();
   17717              :             }
   17718           20 :           rtx x = gen_reg_rtx (hvmode);
   17719              : 
   17720           20 :           ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
   17721           20 :           if (!ok)
   17722              :             return false;
   17723              : 
   17724           20 :           x = gen_rtx_VEC_CONCAT (mode, x, x);
   17725           20 :           emit_insn (gen_rtx_SET (target, x));
   17726              :         }
   17727           20 :       return true;
   17728              : 
   17729              :     default:
   17730              :       return false;
   17731              :     }
   17732              : }
   17733              : 
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.

   MMX_OK indicates that MMX-only modes (V2SI/V2SF/V4HI/V8QI) may be
   expanded; when false those modes bail out with false.  ONE_VAR is the
   element index that receives VAR.  Three strategies are tried in order:
     1. a dedicated "set element 0 of a zero vector" insn (gen_vec_set_0),
     2. zeroing TARGET and using ix86_expand_vector_set on one element,
     3. mode-specific RTL (VEC_CONCAT with zero, VEC_DUPLICATE + VEC_MERGE
	plus a shuffle, or widening recursion for the narrow-element modes).  */

bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  /* Whether strategy 1/2 (a vector-set style expansion) applies at all.  */
  bool use_vector_set = false;
  /* Generator for a "set element 0, zero the rest" insn, when one exists
     for MODE; only consulted when ONE_VAR == 0.  */
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  /* First pass: decide, per mode and enabled ISA, whether a vector-set
     expansion is available and whether a dedicated element-0 insn exists.  */
  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      /* The element-0 fast path additionally needs AVX512FP16.  */
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
	? gen_vec_setv8hi_0 : NULL;
      break;
    case E_V8QImode:
      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V4QImode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V32QImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
	? gen_vec_setv16hi_0 : NULL;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      /* For the 512-bit modes the vector-set path is only taken for
	 element 0; other positions fall through to the second switch
	 (and typically fail there, letting the caller handle it).  */
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    case E_V8HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8hf_0;
      break;
    case E_V16HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16hf_0;
      break;
    case E_V32HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hf_0;
      break;
    case E_V8BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8bf_0;
      break;
    case E_V16BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16bf_0;
      break;
    case E_V32BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32bf_0;
      break;
    case E_V32HImode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hi_0;
      /* Falls through into default, which only breaks — harmless, but
	 NOTE(review): an explicit break here would be clearer; confirm
	 against upstream whether one was dropped in extraction.  */
    default:
      break;
    }

  if (use_vector_set)
    {
      /* Strategy 1: single insn that zeros the vector and inserts VAR
	 into element 0.  */
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      /* Strategy 2: zero TARGET, then insert VAR at ONE_VAR.  */
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  /* Strategy 3: mode-specific fallback expansions.  */
  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      /* Two-element vectors: only the low-element case is handled here,
	 via a VEC_CONCAT of VAR with a zero scalar.  */
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      /* Work in a pseudo so a later shuffle can rewrite it freely; a
	 hard-register TARGET gets the result copied back at the end.  */
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      /* Broadcast VAR, then VEC_MERGE with zero keeping only lane 0
	 (mask const1_rtx), i.e. { var, 0, 0, 0 }.  */
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      /* pshufd selectors place the live lane (index 1 after the
		 merge trick is accounted for) at ONE_VAR and a zero lane
		 everywhere else.  */
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  /* shufps: the +4 selectors pick from the second source operand
	     (here the same register), per the shufps encoding.  */
	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
				       const1_rtx,
				       GEN_INT (one_var == 1 ? 0 : 1),
				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      /* Only element 0 can be widened this way: the zero-extended value
	 lands in the low SImode lane.  */
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	/* The SImode cases above always succeed for element 0, so a
	   failure here would indicate an internal inconsistency.  */
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
   17965              : 
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.

   Strategy: materialize VALS with the ONE_VAR slot replaced by zero as a
   CONST_VECTOR, load that, then insert the variable element with
   ix86_expand_vector_set.  QImode-element vectors without SSE4.1 take a
   widening detour, pairing the variable byte with its constant neighbour
   into one HImode element.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  /* The single non-constant element.  */
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  /* Build the all-constant vector: VALS with the variable slot zeroed.
     copy_rtx first so VALS itself is not modified.  */
  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V8BFmode:
    case E_V16BFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      /* Directly usable: fall out to the load-constant-then-set code
	 after the switch.  */
      break;

    case E_V16QImode:
      /* SSE4.1 has pinsrb, so byte elements can be set directly;
	 otherwise widen to HImode elements.  */
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;
    case E_V4QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V2HImode;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      /* X is the constant byte sharing an HImode slot with VAR
	 (index ONE_VAR with the low bit flipped).  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  /* Variable byte is the high half: shift it up, keep the
	     constant neighbour in the low 8 bits.  */
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  /* Variable byte is the low half: shift the constant up.  */
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      /* OR in the constant neighbour unless it is zero.  */
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      /* Load the constant vector reinterpreted in the wider mode, then
	 set the combined HImode element at the halved index.  */
      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  /* Common path: load the constant vector, then insert VAR.  */
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
   18064              : 
   18065              : /* A subroutine of ix86_expand_vector_init_general.  Use vector
   18066              :    concatenate to handle the most general case: all values variable,
   18067              :    and none identical.  */
   18068              : 
   18069              : static void
   18070       118287 : ix86_expand_vector_init_concat (machine_mode mode,
   18071              :                                 rtx target, rtx *ops, int n)
   18072              : {
   18073       118287 :   machine_mode half_mode = VOIDmode;
   18074       118287 :   rtx half[2];
   18075       118287 :   rtvec v;
   18076       118287 :   int i, j;
   18077              : 
   18078       118287 :   switch (n)
   18079              :     {
   18080       110024 :     case 2:
   18081       110024 :       switch (mode)
   18082              :         {
   18083              :         case E_V32HFmode:
   18084              :           half_mode = V16HFmode;
   18085              :           break;
   18086            0 :         case E_V32BFmode:
   18087            0 :           half_mode = V16BFmode;
   18088            0 :           break;
   18089           81 :         case E_V16SImode:
   18090           81 :           half_mode = V8SImode;
   18091           81 :           break;
   18092           33 :         case E_V16SFmode:
   18093           33 :           half_mode = V8SFmode;
   18094           33 :           break;
   18095           92 :         case E_V8DImode:
   18096           92 :           half_mode = V4DImode;
   18097           92 :           break;
   18098           73 :         case E_V8DFmode:
   18099           73 :           half_mode = V4DFmode;
   18100           73 :           break;
   18101            0 :         case E_V16HFmode:
   18102            0 :           half_mode = V8HFmode;
   18103            0 :           break;
   18104            0 :         case E_V16BFmode:
   18105            0 :           half_mode = V8BFmode;
   18106            0 :           break;
   18107          197 :         case E_V8SImode:
   18108          197 :           half_mode = V4SImode;
   18109          197 :           break;
   18110          271 :         case E_V8SFmode:
   18111          271 :           half_mode = V4SFmode;
   18112          271 :           break;
   18113          308 :         case E_V4DImode:
   18114          308 :           half_mode = V2DImode;
   18115          308 :           break;
   18116          618 :         case E_V4DFmode:
   18117          618 :           half_mode = V2DFmode;
   18118          618 :           break;
   18119         5786 :         case E_V4SImode:
   18120         5786 :           half_mode = V2SImode;
   18121         5786 :           break;
   18122         2257 :         case E_V4SFmode:
   18123         2257 :           half_mode = V2SFmode;
   18124         2257 :           break;
   18125        64630 :         case E_V2DImode:
   18126        64630 :           half_mode = DImode;
   18127        64630 :           break;
   18128        26731 :         case E_V2SImode:
   18129        26731 :           half_mode = SImode;
   18130        26731 :           break;
   18131         3488 :         case E_V2DFmode:
   18132         3488 :           half_mode = DFmode;
   18133         3488 :           break;
   18134         5459 :         case E_V2SFmode:
   18135         5459 :           half_mode = SFmode;
   18136         5459 :           break;
   18137            0 :         default:
   18138            0 :           gcc_unreachable ();
   18139              :         }
   18140              : 
   18141       110024 :       if (!register_operand (ops[1], half_mode))
   18142        48265 :         ops[1] = force_reg (half_mode, ops[1]);
   18143       110024 :       if (!register_operand (ops[0], half_mode))
   18144        36948 :         ops[0] = force_reg (half_mode, ops[0]);
   18145       110024 :       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
   18146              :                                                           ops[1])));
   18147       110024 :       break;
   18148              : 
   18149         7625 :     case 4:
   18150         7625 :       switch (mode)
   18151              :         {
   18152              :         case E_V4DImode:
   18153              :           half_mode = V2DImode;
   18154              :           break;
   18155          535 :         case E_V4DFmode:
   18156          535 :           half_mode = V2DFmode;
   18157          535 :           break;
   18158         4824 :         case E_V4SImode:
   18159         4824 :           half_mode = V2SImode;
   18160         4824 :           break;
   18161         2084 :         case E_V4SFmode:
   18162         2084 :           half_mode = V2SFmode;
   18163         2084 :           break;
   18164            0 :         default:
   18165            0 :           gcc_unreachable ();
   18166              :         }
   18167         7625 :       goto half;
   18168              : 
   18169          545 :     case 8:
   18170          545 :       switch (mode)
   18171              :         {
   18172              :         case E_V8DImode:
   18173              :           half_mode = V4DImode;
   18174              :           break;
   18175           73 :         case E_V8DFmode:
   18176           73 :           half_mode = V4DFmode;
   18177           73 :           break;
   18178          156 :         case E_V8SImode:
   18179          156 :           half_mode = V4SImode;
   18180          156 :           break;
   18181          265 :         case E_V8SFmode:
   18182          265 :           half_mode = V4SFmode;
   18183          265 :           break;
   18184            0 :         default:
   18185            0 :           gcc_unreachable ();
   18186              :         }
   18187          545 :       goto half;
   18188              : 
   18189           93 :     case 16:
   18190           93 :       switch (mode)
   18191              :         {
   18192              :         case E_V16SImode:
   18193              :           half_mode = V8SImode;
   18194              :           break;
   18195           33 :         case E_V16SFmode:
   18196           33 :           half_mode = V8SFmode;
   18197           33 :           break;
   18198            0 :         default:
   18199            0 :           gcc_unreachable ();
   18200              :         }
   18201           93 :       goto half;
   18202              : 
   18203         8263 : half:
   18204              :       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
   18205         8263 :       i = n - 1;
   18206        24789 :       for (j = 1; j != -1; j--)
   18207              :         {
   18208        16526 :           half[j] = gen_reg_rtx (half_mode);
   18209        16526 :           switch (n >> 1)
   18210              :             {
   18211        15250 :             case 2:
   18212        15250 :               v = gen_rtvec (2, ops[i-1], ops[i]);
   18213        15250 :               i -= 2;
   18214        15250 :               break;
   18215         1090 :             case 4:
   18216         1090 :               v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
   18217         1090 :               i -= 4;
   18218         1090 :               break;
   18219          186 :             case 8:
   18220          372 :               v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
   18221          186 :                              ops[i-3], ops[i-2], ops[i-1], ops[i]);
   18222          186 :               i -= 8;
   18223          186 :               break;
   18224            0 :             default:
   18225            0 :               gcc_unreachable ();
   18226              :             }
   18227        16526 :           ix86_expand_vector_init (false, half[j],
   18228              :                                    gen_rtx_PARALLEL (half_mode, v));
   18229              :         }
   18230              : 
   18231         8263 :       ix86_expand_vector_init_concat (mode, target, half, 2);
   18232         8263 :       break;
   18233              : 
   18234            0 :     default:
   18235            0 :       gcc_unreachable ();
   18236              :     }
   18237       118287 : }
   18238              : 
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   interleave to handle the most general case: all values variable,
   and none identical.

   OPS holds 2 * N scalar elements.  Adjacent pairs are first packed
   into N vectors in MODE, then repeatedly interleaved (low halves)
   through FIRST_IMODE / SECOND_IMODE / THIRD_IMODE integer vector
   modes until a single full vector remains, which is stored into
   TARGET.  */

static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op, op0, op1;
  /* Insn generators selected per MODE below: GEN_LOAD_EVEN packs a
     pair of scalars into one vector; the two *_low generators emit
     the successive low-half interleave steps.  */
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_V8HFmode:
      gen_load_even = gen_vec_interleave_lowv8hf;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HFmode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      /* Only two interleave levels are needed for 8-element modes.  */
      third_imode = VOIDmode;
      break;
    case E_V8BFmode:
      gen_load_even = gen_vec_interleave_lowv8bf;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = BFmode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      /* 16-element vectors need a third interleave level.  */
      third_imode = V2DImode;
      break;
    default:
      gcc_unreachable ();
    }

  /* Pack each adjacent pair OPS[2*i], OPS[2*i + 1] into one vector,
     overwriting OPS[i] with the (FIRST_IMODE-cast) result.  */
  for (i = 0; i < n; i++)
    {
      op = ops [i + i];
      if (inner_mode == HFmode || inner_mode == BFmode)
	{
	  rtx even, odd;
	  /* Use vpunpcklwd to pack 2 HFmode or BFmode elements.  */
	  machine_mode vec_mode =
	    (inner_mode == HFmode) ? V8HFmode : V8BFmode;
	  op0 = gen_reg_rtx (vec_mode);
	  even = lowpart_subreg (vec_mode,
				 force_reg (inner_mode, op), inner_mode);
	  odd = lowpart_subreg (vec_mode,
				force_reg (inner_mode, ops[i + i + 1]),
				inner_mode);
	  emit_insn (gen_load_even (op0, even, odd));
	}
      else
	{
	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
	  op0 = gen_reg_rtx (SImode);
	  emit_move_insn (op0, gen_lowpart (SImode, op));

	  /* Insert the SImode value as low element of V4SImode vector.  */
	  op1 = gen_reg_rtx (V4SImode);
	  op0 = gen_rtx_VEC_MERGE (V4SImode,
				   gen_rtx_VEC_DUPLICATE (V4SImode,
							  op0),
				   CONST0_RTX (V4SImode),
				   const1_rtx);
	  emit_insn (gen_rtx_SET (op1, op0));

	  /* Cast the V4SImode vector back to a vector in original mode.  */
	  op0 = gen_reg_rtx (mode);
	  emit_move_insn (op0, gen_lowpart (mode, op1));

	  /* Load even elements into the second position.  */
	  emit_insn (gen_load_even (op0,
				    force_reg (inner_mode,
					       ops[i + i + 1]),
				    const1_rtx));
	}

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors, halving the live count in
     OPS (results are compacted into OPS[0 .. n/2 - 1]).  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  The V4SImode case runs one
     extra interleave pass and then falls through to the final
     V2DImode combine.  */
  switch (second_imode)
    {
    case E_V4SImode:
      for (i = j = 0; i < n / 2; i += 2, j++)
	{
	  op0 = gen_reg_rtx (second_imode);
	  emit_insn (gen_interleave_second_low (op0, ops[i],
						ops[i + 1]));

	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
	     vector.  */
	  ops[j] = gen_reg_rtx (third_imode);
	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
	}
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      /* Final step: interleave the remaining two vectors into one.  */
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
					    ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector on original
	 mode.  */
      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}
   18388              : 
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.

   Dispatch on MODE: element-count-friendly modes are recursively
   concatenated from halves; 256/512-bit integer/FP16/BF16 modes are
   built from half- or quarter-vectors via interleaving; everything
   that breaks out of the switch falls through to a scalar path that
   shifts/ORs elements into word-sized integers.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  /* Set to HImode for small HF/BF vectors; signals the scalar path
     below to move elements through integer registers.  */
  machine_mode int_inner_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      /* Without MMX or SSE these must take the scalar word path.  */
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      /* Wide-element modes: build by recursive concatenation.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      /* View the two TImode elements as V2DImode pieces and build a
	 V4DImode vector, then bitcast into TARGET.  */
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      /* Likewise, via two V4DImode halves concatenated to V8DImode.  */
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    case E_V16HFmode:
      half_mode = V8HFmode;
      goto half;

    case E_V16BFmode:
      half_mode = V8BFmode;
      goto half;

half:
      /* 256-bit narrow-element modes: interleave each half
	 separately (n >> 2 element pairs each), then concatenate.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    case E_V32HFmode:
      quarter_mode = V8HFmode;
      half_mode = V16HFmode;
      goto quarter;

    case E_V32BFmode:
      quarter_mode = V8BFmode;
      half_mode = V16BFmode;
      goto quarter;

quarter:
      /* 512-bit narrow-element modes: interleave four quarters
	 (n >> 3 element pairs each), pair them into halves, then
	 concatenate the halves into TARGET.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      /* vec_setv16qi (used by the interleave path) needs SSE4.1.  */
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;
      /* FALLTHRU */

    case E_V8HFmode:
    case E_V8BFmode:

      /* 128-bit narrow-element modes: a single interleave pass.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2HFmode:
    case E_V2BFmode:
      /* Small FP16/BF16 vectors take the scalar path, but elements
	 must first be moved through HImode integer registers.  */
      int_inner_mode = HImode;
      break;

    case E_V4HImode:
    case E_V8QImode:

    case E_V2HImode:
    case E_V4QImode:
      break;

    default:
      gcc_unreachable ();
    }

    /* Scalar fallback: assemble elements into word-sized integers
       with shift/IOR, then move the words into TARGET (recursing for
       multi-word vectors).  */
    {
      int i, j, n_elts, n_words, n_elt_per_word;
      machine_mode tmp_mode, inner_mode;
      rtx words[4], shift;

      /* Word-size container: SImode for sub-word vectors, otherwise
	 the target's natural word mode.  */
      tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;

      inner_mode = GET_MODE_INNER (mode);
      n_elts = GET_MODE_NUNITS (mode);
      n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
      n_elt_per_word = n_elts / n_words;
      shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

      for (i = 0; i < n_words; ++i)
	{
	  rtx word = NULL_RTX;

	  /* Walk the word's elements from most- to least-significant
	     so each shift makes room for the next element.  */
	  for (j = 0; j < n_elt_per_word; ++j)
	    {
	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	      if (int_inner_mode != E_VOIDmode)
		{
		  /* HF/BF element: reinterpret its bits as HImode via
		     an integer register before widening.  */
		  gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
		  rtx tmp = gen_reg_rtx (int_inner_mode);
		  elt = lowpart_subreg (int_inner_mode,
					force_reg (inner_mode, elt),
					inner_mode);
		  emit_move_insn (tmp, elt);
		  elt = tmp;
		}
	      elt = convert_modes (tmp_mode, inner_mode, elt, true);

	      if (j == 0)
		word = elt;
	      else
		{
		  word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
		  word = expand_simple_binop (tmp_mode, IOR, word, elt,
					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
		}
	    }

	  words[i] = word;
	}

      if (n_words == 1)
	emit_move_insn (target, gen_lowpart (mode, words[0]));
      else if (n_words == 2)
	{
	  /* Recurse to build a two-word integer vector, then bitcast.  */
	  gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
	  machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
	  rtx tmp = gen_reg_rtx (concat_mode);
	  vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
	  ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
	  emit_move_insn (target, gen_lowpart (mode, tmp));
	}
      else if (n_words == 4)
	{
	  rtx tmp = gen_reg_rtx (V4SImode);
	  gcc_assert (tmp_mode == SImode);
	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	  emit_move_insn (target, gen_lowpart (mode, tmp));
	}
      else
	gcc_unreachable ();
    }
}
   18631              : 
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.

   VALS is a PARALLEL whose elements are either scalars (one per
   vector element) or, as a special case, two half-width vectors to
   concatenate.  Tries progressively more general strategies: zero
   vector, broadcast, constant pool, one variable element, and
   finally the fully general expander.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      /* Only the two-half-vector concatenation form is supported.  */
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode
	      || inner_mode == HImode
	      || inner_mode == TImode
	      || inner_mode == HFmode
	      || inner_mode == BFmode)
	    {
	      /* For sub-SI or TImode elements, reinterpret both halves
		 in an SImode- (or DImode-) element vector mode so the
		 concat expander can handle them, then bitcast back.
		 Note INNER_MODE is reused here to hold the half
		 *vector* mode, not a scalar mode.  */
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
	      n_bits /= GET_MODE_SIZE (elt_mode);
	      mode = mode_for_vector (elt_mode, n_bits).require ();
	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  /* Classify the elements: count the variable ones (remembering the
     last one's index), and track whether all are equal and whether
     all constants are zero.  */
  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Handle the zero vector as special case.  */
  if (n_var == 0 && all_const_zero)
    {
      emit_move_insn (target, CONST0_RTX (mode));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  /* Fall back to the fully general expansion.  */
  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
   18727              : 
/* Set the element of vector TARGET selected by the run-time index IDX
   to VAL.  Implemented as
   V setg (V v, int idx, T val)
   {
     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
     V valv = (V){val, val, val, val, val, val, val, val};
     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
     v = (v & ~mask) | (valv & mask);
     return v;
   }.  */
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  rtx vec[64];
  machine_mode mode = GET_MODE (target);
  /* Mode used for the vector equality comparison; for float element
     modes this is switched below to the same-sized integer vector mode.  */
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv,idxv,constv,idx_tmp;
  bool ok = false;

  /* 512-bits vector byte/word broadcast and comparison only available
     under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
     when without TARGET_AVX512BW.  */
  if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
       || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
        {
          half_mode = V16HImode;
          extract_hi = gen_vec_extract_hi_v32hi;
          extract_lo = gen_vec_extract_lo_v32hi;
        }
      else if (mode == V32HFmode)
        {
          half_mode = V16HFmode;
          extract_hi = gen_vec_extract_hi_v32hf;
          extract_lo = gen_vec_extract_lo_v32hf;
        }
      else if (mode == V32BFmode)
        {
          half_mode = V16BFmode;
          extract_hi = gen_vec_extract_hi_v32bf;
          extract_lo = gen_vec_extract_lo_v32bf;
        }
      else
        {
          half_mode = V32QImode;
          extract_hi = gen_vec_extract_hi_v64qi;
          extract_lo = gen_vec_extract_lo_v64qi;
        }

      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      /* idx_hi = idx - n_elts/2, i.e. IDX rebased onto the high half.
	 Operand order for ix86_expand_binary_operator: dest, src1, src2.  */
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      /* Recurse into both halves; the half whose (rebased) index is out
	 of range is left untouched, because the equality mask built by
	 the recursive call then matches no element.  */
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  /* For float element modes perform the index comparison in the
     same-sized integer vector mode.  */
  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
        {
        case E_V2DFmode:
          cmp_mode = V2DImode;
          break;
        case E_V4DFmode:
          cmp_mode = V4DImode;
          break;
        case E_V8DFmode:
          cmp_mode = V8DImode;
          break;
        case E_V2SFmode:
          cmp_mode = V2SImode;
          break;
        case E_V4SFmode:
          cmp_mode = V4SImode;
          break;
        case E_V8SFmode:
          cmp_mode = V8SImode;
          break;
        case E_V16SFmode:
          cmp_mode = V16SImode;
          break;
        case E_V2HFmode:
        case E_V2BFmode:
          cmp_mode = V2HImode;
          break;
        case E_V4HFmode:
        case E_V4BFmode:
          cmp_mode = V4HImode;
          break;
        case E_V8HFmode:
          cmp_mode = V8HImode;
          break;
        case E_V16HFmode:
          cmp_mode = V16HImode;
          break;
        case E_V32HFmode:
          cmp_mode = V32HImode;
          break;
        case E_V8BFmode:
          cmp_mode = V8HImode;
          break;
        case E_V16BFmode:
          cmp_mode = V16HImode;
          break;
        case E_V32BFmode:
          cmp_mode = V32HImode;
          break;
        default:
          gcc_unreachable ();
        }
    }

  /* Build the constant vector {0, 1, ..., n_elts-1} that the broadcast
     index is compared against.  */
  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  /* Zero-extend (or truncate) IDX to the comparison element mode before
     broadcasting it.  */
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  /* Broadcast VAL and the converted index across full vectors.  */
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
                                          mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
                                          cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  /* target = (idxv == constv) ? valv : target, expanded through
     ix86_expand_int_vcond; operand order: dest, then-value, else-value,
     comparison, and the two comparison operands.  */
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
   18878              : 
/* Set element ELT of vector TARGET to VAL.  MMX_OK indicates that MMX
   instruction sequences are acceptable for the 64-bit vector modes.
   256-bit and 512-bit modes without a direct insert pattern are handled
   recursively: extract the 128/256-bit part containing ELT, insert into
   it, and write the part back.  */
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  bool blendm_const = false;
  rtx tmp;
  /* Patterns to extract ([j][0] = lo, [j][1] = hi) and re-insert the
     halves of a 256-bit vector, indexed by the per-mode J set below.  */
  static rtx (*gen_extract[8][2]) (rtx, rtx)
    = {
        { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
        { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
        { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
        { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
        { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
        { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
        { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
        { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
      };
  static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
    = {
        { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
        { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
        { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
        { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
        { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
        { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
        { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
        { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
      };
  int i, j, n;
  /* Mask mode for an AVX512/AVX2 blendm-style insert; stays VOIDmode
     when that path is not used.  */
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
        break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
        {
          /* Two-element vector: extract the other element and
             re-concatenate it with VAL in the right order.  */
          tmp = gen_reg_rtx (GET_MODE_INNER (mode));
          ix86_expand_vector_extract (true, tmp, target, 1 - elt);
          if (elt == 0)
            tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
          else
            tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
          emit_insn (gen_rtx_SET (target, tmp));
          return;
        }
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
        break;

      /* Without SSE4.1 pinsrq, rebuild the vector as a VEC_CONCAT of
         the untouched element and VAL.  */
      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
        tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
        tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
         preserve the rest of the vector for combiner:

         (vec_merge:V2DF
           (vec_duplicate:V2DF (reg:DF))
           (reg:V2DF)
           (const_int 1))
       */
      if (elt == 0)
        goto do_vec_merge;

      {
        rtx op0, op1;

        /* For the two element vectors, we implement a VEC_CONCAT with
           the extraction of the other element.  */

        tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
        tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

        if (elt == 0)
          op0 = val, op1 = tmp;
        else
          op0 = tmp, op1 = val;

        tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
        emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
        break;

      /* Pre-SSE4.1: emulate insertps with unpcklps/shufps sequences.
         shufps selectors 0-3 pick from the first source, 4-7 from the
         second.  */
      switch (elt)
        {
        case 0:
          use_vec_merge = true;
          break;

        case 1:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* target = A A B B */
          emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
          /* target = X A B B */
          ix86_expand_vector_set (false, target, val, 0);
          /* target = A X C D  */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const1_rtx, const0_rtx,
                                          GEN_INT (2+4), GEN_INT (3+4)));
          return;

        case 2:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* tmp = X B C D */
          ix86_expand_vector_set (false, tmp, val, 0);
          /* target = A B X D */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const0_rtx, const1_rtx,
                                          GEN_INT (0+4), GEN_INT (3+4)));
          return;

        case 3:
          /* tmp = target = A B C D */
          tmp = copy_to_reg (target);
          /* tmp = X B C D */
          ix86_expand_vector_set (false, tmp, val, 0);
          /* target = A B C X (selectors 2+4, 0+4 pick tmp[2]=C, tmp[0]=X) */
          emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
                                          const0_rtx, const1_rtx,
                                          GEN_INT (2+4), GEN_INT (0+4)));
          return;

        default:
          gcc_unreachable ();
        }
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
        break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
        {
          use_vec_merge = true;
          break;
        }

      if (TARGET_SSE2)
        {
          /* With SSE2, use integer shuffles to swap element 0 and ELT,
             store into element 0, then shuffle them back.  */

          rtx order[4];

          /* ORDER describes a permutation that is its own inverse, so
             applying the same pshufd twice restores the layout.  */
          order[0] = GEN_INT (elt);
          order[1] = const1_rtx;
          order[2] = const2_rtx;
          order[3] = GEN_INT (3);
          order[elt] = const0_rtx;

          emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
                                        order[1], order[2], order[3]));

          ix86_expand_vector_set (false, target, val, 0);

          emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
                                        order[1], order[2], order[3]));
        }
      else
        {
          /* For SSE1, we have to reuse the V4SF code.  */
          rtx t = gen_reg_rtx (V4SFmode);
          emit_move_insn (t, gen_lowpart (V4SFmode, target));
          ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
          emit_move_insn (target, gen_lowpart (mode, t));
        }
      return;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
    case E_V4QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    /* 256-bit modes: split into 128-bit halves via gen_extract/gen_insert
       table index J, with N elements per half.  */
    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HFmode:
    case E_V16BFmode:
      /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
      if (TARGET_AVX2 && elt != 0)
        {
          mmode = SImode;
          gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
                                                : gen_avx2_pblendbf_1);
          blendm_const = true;
          break;
        }
      else
        {
          half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
          j = ((mode == E_V16HFmode) ? 6 : 7);
          n = 8;
          goto half;
        }

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
        {
          mmode = QImode;
          gen_blendm = gen_avx512f_blendmv8df;
        }
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
        {
          mmode = QImode;
          gen_blendm = gen_avx512f_blendmv8di;
        }
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
        {
          mmode = HImode;
          gen_blendm = gen_avx512f_blendmv16sf;
        }
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
        {
          mmode = HImode;
          gen_blendm = gen_avx512f_blendmv16si;
        }
      break;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32hf;
        }
      break;
    case E_V32BFmode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32bf;
        }
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32hi;
        }
      else if (TARGET_AVX512F)
        {
          /* No byte/word blendm without AVX512BW; go through a 128-bit
             quarter instead.  */
          half_mode = E_V8HImode;
          n = 8;
          goto quarter;
        }
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
        {
          mmode = DImode;
          gen_blendm = gen_avx512bw_blendmv64qi;
        }
      else if (TARGET_AVX512F)
        {
          half_mode = E_V16QImode;
          n = 16;
          goto quarter;
        }
      break;

quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
        /* Extract the quarter.  */
        tmp = gen_reg_rtx (V4SImode);
        rtx tmp2 = gen_lowpart (V16SImode, target);
        rtx mask = gen_reg_rtx (QImode);

        /* All-ones mask: the masked extract/insert patterns below then
           behave as plain unmasked operations.  */
        emit_move_insn (mask, constm1_rtx);
        emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
                                                   tmp, mask));

        tmp2 = gen_reg_rtx (half_mode);
        emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
        tmp = tmp2;

        /* Put val in tmp at elt.  */
        ix86_expand_vector_set (false, tmp, val, elt);

        /* Put it back.  */
        tmp2 = gen_reg_rtx (V16SImode);
        rtx tmp3 = gen_lowpart (V16SImode, target);
        mask = gen_reg_rtx (HImode);
        emit_move_insn (mask, constm1_rtx);
        tmp = gen_lowpart (V4SImode, tmp);
        emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
                                                  tmp3, mask));
        emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      /* Blendm path: broadcast VAL, then blend it into TARGET under a
         single-bit mask selecting element ELT.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
      /* The avx512*_blendm<mode> expanders have different operand order
         from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
         elements where the mask is set and second input operand otherwise,
         in {sse,avx}*_*blend* the first input operand is used for elements
         where the mask is clear and second input operand otherwise.  */
      if (!blendm_const)
        merge_mask = force_reg (mmode, merge_mask);
      emit_insn (gen_blendm (target, target, tmp, merge_mask));
    }
  else if (use_vec_merge)
    {
do_vec_merge:
      /* Insert via (vec_merge (vec_duplicate VAL) TARGET (1 << ELT)).  */
      if (!nonimmediate_operand (val, inner_mode))
        val = force_reg (inner_mode, val);
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
                               GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      /* Last resort: spill the vector to a stack temporary, store the
         element through memory, and reload the whole vector.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
   19321              : 
/* Extract element ELT of vector VEC into TARGET, which has the
   vector's element (inner) mode.  MMX_OK says whether MMX-register
   based sequences may be used for the 64-bit vector modes.  Modes
   without a usable vec_select pattern are either reduced to a
   narrower extract by shuffling/halving and recursing, or fall back
   to a spill through a stack temporary.  */

void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  /* Decide per mode whether a direct vec_select works; several cases
     instead rewrite VEC/ELT (or recurse) before the common tail.  */
  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
        break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
        break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
        break;

      /* Without SSE4.1, shuffle the wanted element into position 0
         first, then extract element 0 via the common tail.  */
      switch (elt)
        {
        case 0:
          tmp = vec;
          break;

        case 1:
        case 3:
          tmp = gen_reg_rtx (mode);
          emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
                                       GEN_INT (elt), GEN_INT (elt),
                                       GEN_INT (elt+4), GEN_INT (elt+4)));
          break;

        case 2:
          tmp = gen_reg_rtx (mode);
          emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
          break;

        default:
          gcc_unreachable ();
        }
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
        break;

      /* Without SSE4.1 but with SSE2, shuffle the wanted element to
         position 0 and extract that.  */
      if (TARGET_SSE2)
        {
          switch (elt)
            {
            case 0:
              tmp = vec;
              break;

            case 1:
            case 3:
              tmp = gen_reg_rtx (mode);
              emit_insn (gen_sse2_pshufd_1 (tmp, vec,
                                            GEN_INT (elt), GEN_INT (elt),
                                            GEN_INT (elt), GEN_INT (elt)));
              break;

            case 2:
              tmp = gen_reg_rtx (mode);
              emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
              break;

            default:
              gcc_unreachable ();
            }
          vec = tmp;
          use_vec_extr = true;
          elt = 0;
        }
      else
        {
          /* For SSE1, we have to reuse the V4SF code.  */
          ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
                                      gen_lowpart (V4SFmode, vec), elt);
          return;
        }
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      /* Without SSE4.1, element 0 can still be fetched cheaply by
         extracting the low SImode element and taking its low byte.  */
      if (!use_vec_extr
          && TARGET_SSE2
          && elt == 0
          && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
        {
          tmp = gen_reg_rtx (SImode);
          ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
                                      0);
          emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
          return;
        }
      break;
    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    /* For the 256-bit modes below, extract the containing 128-bit
       half and recurse on it with the element index reduced.  */
    case E_V8SFmode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V4SFmode);
          if (elt < 4)
            emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 3);
          return;
        }
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V2DFmode);
          if (elt < 2)
            emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 1);
          return;
        }
      break;

    case E_V32QImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V16QImode);
          if (elt < 16)
            emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 15);
          return;
        }
      break;

    case E_V16HImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V8HImode);
          if (elt < 8)
            emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 7);
          return;
        }
      break;

    case E_V8SImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V4SImode);
          if (elt < 4)
            emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 3);
          return;
        }
      break;

    case E_V4DImode:
      if (TARGET_AVX)
        {
          tmp = gen_reg_rtx (V2DImode);
          if (elt < 2)
            emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 1);
          return;
        }
      break;

    /* Likewise for the 512-bit modes: halve to 256 bits and recurse.  */
    case E_V32HImode:
      if (TARGET_AVX512BW)
        {
          tmp = gen_reg_rtx (V16HImode);
          if (elt < 16)
            emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 15);
          return;
        }
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
        {
          tmp = gen_reg_rtx (V32QImode);
          if (elt < 32)
            emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
          else
            emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 31);
          return;
        }
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
        emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
        emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
        emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
        emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
        emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
        {
          tmp = (mode == E_V32HFmode
                 ? gen_reg_rtx (V16HFmode)
                 : gen_reg_rtx (V16BFmode));
          if (elt < 16)
            emit_insn (gen_vec_extract_lo (mode, tmp, vec));
          else
            emit_insn (gen_vec_extract_hi (mode, tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 15);
          return;
        }
      break;

    case E_V16HFmode:
    case E_V16BFmode:
      if (TARGET_AVX)
        {
          tmp = (mode == E_V16HFmode
                 ? gen_reg_rtx (V8HFmode)
                 : gen_reg_rtx (V8BFmode));
          if (elt < 8)
            emit_insn (gen_vec_extract_lo (mode, tmp, vec));
          else
            emit_insn (gen_vec_extract_hi (mode, tmp, vec));
          ix86_expand_vector_extract (false, target, tmp, elt & 7);
          return;
        }
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      /* Emit a plain (vec_select VEC (parallel [ELT])) move.  */
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
        {
          rtx reg = gen_reg_rtx (SImode);
          tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
          emit_move_insn (reg, tmp);
          tmp = gen_lowpart (inner_mode, reg);
          SUBREG_PROMOTED_VAR_P (tmp) = 1;
          SUBREG_PROMOTED_SET (tmp, 1);
        }

      emit_move_insn (target, tmp);
    }
  else
    {
      /* No direct pattern available: spill the whole vector to a
         stack temporary and reload just the wanted element.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
   19665              : 
   19666              : /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   19667              :    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   19668              :    The upper bits of DEST are undefined, though they shouldn't cause
   19669              :    exceptions (some bits from src or all zeros are ok).  */
   19670              : 
static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  /* D is where the shuffle/shift result is built; it may be a scratch
     register in a different mode, in which case the low part is copied
     into DEST at the end.  */
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      /* movhlps moves the high 64 bits into the low half; shufps
         broadcasts element 1 for the 32-bit halving step.  */
      if (i == 128)
        tem = gen_sse_movhlps (dest, src, src);
      else
        tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
                                   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V4QImode:
      /* Shift the whole 32-bit vector right by i/2 bits.  */
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
                               GEN_INT (i / 2));
      break;
    case E_V8QImode:
    case E_V4HImode:
      /* Shift the whole 64-bit vector right by i/2 bits.  */
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
                               GEN_INT (i / 2));
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      /* When the tuning prefers pshuf* over full-width shifts, use a
         pshufd/pshuflw shuffle for the supported halving widths.  */
      if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
        {
          if (i == 128)
            {
              d = gen_reg_rtx (V4SImode);
              tem = gen_sse2_pshufd_1 (
                  d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
                  GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
              break;
            }
          else if (i == 64)
            {
              d = gen_reg_rtx (V4SImode);
              tem = gen_sse2_pshufd_1 (
                  d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
                  GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
              break;
            }
          else if (i == 32)
            {
              d = gen_reg_rtx (V8HImode);
              tem = gen_sse2_pshuflw_1 (
                  d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
                  GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
              break;
            }
        }
      /* Otherwise shift the full 128-bit vector right by i/2 bits.  */
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
                                GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      /* 256->128: swap the 128-bit lanes; otherwise shuffle within
         lanes to bring the upper elements down.  */
      if (i == 256)
        tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
        tem = gen_avx_shufps256 (dest, src, src,
                                 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
        tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
        tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
        {
          /* 256->128: bring the high 128-bit lane down with vperm2i128.  */
          if (GET_MODE (dest) != V4DImode)
            d = gen_reg_rtx (V4DImode);
          tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
                                   gen_lowpart (V4DImode, src),
                                   const1_rtx);
        }
      else
        {
          /* Per-lane right shift by i/2 bits.  */
          d = gen_reg_rtx (V2TImode);
          tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
                                    GEN_INT (i / 2));
        }
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      /* Narrow steps are per-lane shifts; wider steps share the
         512-bit lane-permute code below.  */
      if (i < 64)
        {
          d = gen_reg_rtx (V4TImode);
          tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
                                        GEN_INT (i / 2));
          break;
        }
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
        /* Move the upper 128-bit lanes down with shuf_i32x4.  */
        tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
                                        gen_lowpart (V16SImode, src),
                                        gen_lowpart (V16SImode, src),
                                        GEN_INT (0x4 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0x5 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0x6 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0x7 + (i == 512 ? 4 : 0)),
                                        GEN_INT (0xC), GEN_INT (0xD),
                                        GEN_INT (0xE), GEN_INT (0xF),
                                        GEN_INT (0x10), GEN_INT (0x11),
                                        GEN_INT (0x12), GEN_INT (0x13),
                                        GEN_INT (0x14), GEN_INT (0x15),
                                        GEN_INT (0x16), GEN_INT (0x17));
      else
        /* Within 128-bit lanes, shuffle the upper elements down.  */
        tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
                                    gen_lowpart (V16SImode, src),
                                    GEN_INT (i == 128 ? 0x2 : 0x1),
                                    GEN_INT (0x3),
                                    GEN_INT (0x3),
                                    GEN_INT (0x3),
                                    GEN_INT (i == 128 ? 0x6 : 0x5),
                                    GEN_INT (0x7),
                                    GEN_INT (0x7),
                                    GEN_INT (0x7),
                                    GEN_INT (i == 128 ? 0xA : 0x9),
                                    GEN_INT (0xB),
                                    GEN_INT (0xB),
                                    GEN_INT (0xB),
                                    GEN_INT (i == 128 ? 0xE : 0xD),
                                    GEN_INT (0xF),
                                    GEN_INT (0xF),
                                    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  /* If the result was built in a differently-moded scratch, copy its
     low part into DEST.  */
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
   19823              : 
   19824              : /* Expand a vector reduction.  FN is the binary pattern to reduce;
   19825              :    DEST is the destination; IN is the input vector.  */
   19826              : 
   19827              : void
   19828        20886 : ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
   19829              : {
   19830        20886 :   rtx half, dst, vec = in;
   19831        20886 :   machine_mode mode = GET_MODE (in);
   19832        20886 :   int i;
   19833              : 
   19834              :   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
   19835        20886 :   if (TARGET_SSE4_1
   19836         9943 :       && mode == V8HImode
   19837          780 :       && fn == gen_uminv8hi3)
   19838              :     {
   19839            4 :       emit_insn (gen_sse4_1_phminposuw (dest, in));
   19840            4 :       return;
   19841              :     }
   19842              : 
   19843        41764 :   for (i = GET_MODE_BITSIZE (mode);
   19844       125480 :        i > GET_MODE_UNIT_BITSIZE (mode);
   19845        41858 :        i >>= 1)
   19846              :     {
   19847        41858 :       half = gen_reg_rtx (mode);
   19848        41858 :       emit_reduc_half (half, vec, i);
   19849        83716 :       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
   19850              :         dst = dest;
   19851              :       else
   19852        20976 :         dst = gen_reg_rtx (mode);
   19853        41858 :       emit_insn (fn (dst, half, vec));
   19854        41858 :       vec = dst;
   19855              :     }
   19856              : }
   19857              : 
   19858              : /* Output code to perform a conditional jump to LABEL, if C2 flag in
   19859              :    FP status register is set.  */
   19860              : 
   19861              : void
   19862          284 : ix86_emit_fp_unordered_jump (rtx label)
   19863              : {
   19864          284 :   rtx reg = gen_reg_rtx (HImode);
   19865          284 :   rtx_insn *insn;
   19866          284 :   rtx temp;
   19867              : 
   19868          284 :   emit_insn (gen_x86_fnstsw_1 (reg));
   19869              : 
   19870          284 :   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
   19871              :     {
   19872           37 :       emit_insn (gen_x86_sahf_1 (reg));
   19873              : 
   19874           37 :       temp = gen_rtx_REG (CCmode, FLAGS_REG);
   19875           37 :       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
   19876              :     }
   19877              :   else
   19878              :     {
   19879          247 :       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
   19880              : 
   19881          247 :       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
   19882          247 :       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
   19883              :     }
   19884              : 
   19885          284 :   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
   19886              :                               gen_rtx_LABEL_REF (VOIDmode, label),
   19887              :                               pc_rtx);
   19888          284 :   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
   19889          284 :   predict_jump (REG_BR_PROB_BASE * 10 / 100);
   19890          284 :   JUMP_LABEL (insn) = label;
   19891          284 : }
   19892              : 
/* Output code to perform an sinh XFmode calculation.

   Computes sinh(x) via the identity
     sinh(x) = sign(x) * 0.5 * (e / (e + 1.0) + e),  e = expm1 (|x|)
   since (e+1) - 1/(e+1) == e + e/(e+1); going through expm1 avoids
   cancellation for small |x|.  */

void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1); classify op1 before |op1| destroys its sign.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|); e2 doubles as a temporary holding |op1|.  */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1); bit 0x02 of the high status byte is the
     C1 flag set by fxam.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2; i.e. skip the negation when the sign
     bit of op1 is clear.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
   19941              : 
/* Output code to perform a cosh XFmode calculation.  */
   19943              : 
   19944              : void
   19945            3 : ix86_emit_i387_cosh (rtx op0, rtx op1)
   19946              : {
   19947            3 :   rtx e1 = gen_reg_rtx (XFmode);
   19948            3 :   rtx e2 = gen_reg_rtx (XFmode);
   19949            3 :   rtx half = const_double_from_real_value (dconsthalf, XFmode);
   19950            3 :   rtx cst1;
   19951              : 
   19952              :   /* e1 = exp (op1) */
   19953            3 :   emit_insn (gen_expxf2 (e1, op1));
   19954              : 
   19955              :   /* e2 = e1 + 1.0 / e1 */
   19956            3 :   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
   19957            3 :   emit_insn (gen_divxf3 (e2, cst1, e1));
   19958            3 :   emit_insn (gen_addxf3 (e2, e1, e2));
   19959              : 
   19960              :   /* op0 = 0.5 * e2 */
   19961            3 :   half = force_reg (XFmode, half);
   19962            3 :   emit_insn (gen_mulxf3 (op0, e2, half));
   19963            3 : }
   19964              : 
/* Output code to perform a tanh XFmode calculation.  */
   19966              : 
void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* tanh (x) = -sgn (x) * e / (e + 2.0), where e = expm1 (-|2 * x|).
     Going through expm1 keeps the result accurate for small |x|.  */

  /* scratch = fxam (op1); the fxam status word carries the sign of op1.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0); this equals -tanh (|op1|).  */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1); test bit 0x02 (C1) of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2; i.e. jump over the negation when op1 is
     negative, because e2 already equals tanh (-|op1|) then.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_NE (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* The sign of the input is taken to be unpredictable.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
   20011              : 
   20012              : /* Output code to perform an asinh XFmode calculation.  */
   20013              : 
void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* asinh (x) = sgn (x) * log1p (|x| + x^2 / (1.0 + sqrt (x^2 + 1.0))).
     The log1p form remains accurate for small |x|, where computing
     log (|x| + sqrt (x^2 + 1.0)) directly would lose precision.  */

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2, i.e. op1^2 / (sqrt (op1^2 + 1.0) + 1.0), which equals
     sqrt (op1^2 + 1.0) - 1.0 without cancellation.  */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1); the fxam status word carries the sign of op1.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1), i.e. asinh (|op1|).  */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1); test bit 0x02 (C1) of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2; i.e. jump over the negation when op1 is
     non-negative.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* The sign of the input is taken to be unpredictable.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
   20064              : 
   20065              : /* Output code to perform an acosh XFmode calculation.  */
   20066              : 
   20067              : void
   20068            0 : ix86_emit_i387_acosh (rtx op0, rtx op1)
   20069              : {
   20070            0 :   rtx e1 = gen_reg_rtx (XFmode);
   20071            0 :   rtx e2 = gen_reg_rtx (XFmode);
   20072            0 :   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
   20073              : 
   20074              :   /* e2 = sqrt (op1 + 1.0) */
   20075            0 :   emit_insn (gen_addxf3 (e2, op1, cst1));
   20076            0 :   emit_insn (gen_sqrtxf2 (e2, e2));
   20077              : 
   20078              :   /* e1 = sqrt (op1 - 1.0) */
   20079            0 :   emit_insn (gen_subxf3 (e1, op1, cst1));
   20080            0 :   emit_insn (gen_sqrtxf2 (e1, e1));
   20081              : 
   20082              :   /* e1 = e1 * e2 */
   20083            0 :   emit_insn (gen_mulxf3 (e1, e1, e2));
   20084              : 
   20085              :   /* e1 = e1 + op1 */
   20086            0 :   emit_insn (gen_addxf3 (e1, e1, op1));
   20087              : 
   20088              :   /* op0 = log (e1) */
   20089            0 :   emit_insn (gen_logxf2 (op0, e1));
   20090            0 : }
   20091              : 
   20092              : /* Output code to perform an atanh XFmode calculation.  */
   20093              : 
void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* atanh (x) = -0.5 * sgn (x) * log1p (-2.0 * |x| / (|x| + 1.0)),
     because log1p (-2.0 * |x| / (|x| + 1.0))
	    = log ((1.0 - |x|) / (1.0 + |x|)) = -2.0 * atanh (|x|).  */

  /* scratch = fxam (op1); the fxam status word carries the sign of op1.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1), i.e. -2.0 * atanh (|op1|).  */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1); test bit 0x02 (C1) of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2; i.e. jump over the negation when op1 is
     negative, because e2 already has the right sign then.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_NE (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* The sign of the input is taken to be unpredictable.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
   20143              : 
   20144              : /* Output code to perform a log1p XFmode calculation.  */
   20145              : 
void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  /* The threshold is 1.0 - sqrt (2.0) / 2.0: fyl2xp1 is only accurate
     for operands inside this bound, so outside it we fall back to
     fyl2x on op1 + 1.0.  */
  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  /* tmp = |op1| */
  emit_insn (gen_absxf2 (tmp, op1));

  /* if (|op1| >= cst) goto label1 */
  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  /* Small-magnitude path: res = ln (2.0) * log2 (1.0 + op1) via fyl2xp1.  */
  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  /* Fallback path: res = ln (2.0) * log2 (op1 + 1.0) via fyl2x.  */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
   20189              : 
   20190              : /* Emit code for round calculation.  */
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  /* Extend narrower float inputs to XFmode; the whole computation is
     done on the i387 stack in XFmode.  */
  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Pick the flooring insn (frndint for float outputs, lfloor for
     integer outputs) and the negation insn matching OUTMODE.  */
  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1); the fxam status word carries the sign of op1.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
        /* Floor in XFmode, then narrow with a no-op truncation unspec
	   (the floored value is exactly representable in OUTMODE).  */
        tmp = gen_reg_rtx (XFmode);

        emit_insn (floor_insn (tmp, e2));
        emit_insn (gen_rtx_SET (res,
                                gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
                                                UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a); test bit 0x02 (C1) of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res; i.e. jump over the negation when op1 is
     non-negative.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
                              gen_rtx_EQ (VOIDmode, flags, const0_rtx),
                              gen_rtx_LABEL_REF (VOIDmode, jump_label),
                              pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* The sign of the input is taken to be unpredictable.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
   20302              : 
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
   20305              : 
   20306              : void
   20307           55 : ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
   20308              : {
   20309           55 :   rtx x0, x1, e0, e1;
   20310              : 
   20311           55 :   x0 = gen_reg_rtx (mode);
   20312           55 :   e0 = gen_reg_rtx (mode);
   20313           55 :   e1 = gen_reg_rtx (mode);
   20314           55 :   x1 = gen_reg_rtx (mode);
   20315              : 
   20316           55 :   b = force_reg (mode, b);
   20317              : 
   20318              :   /* x0 = rcp(b) estimate */
   20319           55 :   if (mode == V16SFmode || mode == V8DFmode)
   20320              :     {
   20321            0 :       emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
   20322              :                                                   UNSPEC_RCP14)));
   20323              :     }
   20324              :   else
   20325           55 :     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
   20326              :                                                 UNSPEC_RCP)));
   20327              : 
   20328           55 :   unsigned vector_size = GET_MODE_SIZE (mode);
   20329              : 
   20330              :   /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
   20331              :      N-R step with 2 fma implementation.  */
   20332           55 :   if (TARGET_FMA
   20333           54 :       || (TARGET_AVX512F && vector_size == 64)
   20334           54 :       || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
   20335              :     {
   20336              :       /* e0 = x0 * a  */
   20337            1 :       emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
   20338              :       /* e1 = e0 * b - a  */
   20339            1 :       emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
   20340              :                                                gen_rtx_NEG (mode, a))));
   20341              :       /* res = - e1 * x0 + e0  */
   20342            1 :       emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
   20343              :                                                gen_rtx_NEG (mode, e1),
   20344              :                                                x0, e0)));
   20345              :     }
   20346              :   else
   20347              :     /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
   20348              :     {
   20349              :       /* e0 = x0 * b */
   20350           54 :       emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
   20351              : 
   20352              :       /* e1 = x0 + x0 */
   20353           54 :       emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
   20354              : 
   20355              :       /* e0 = x0 * e0 */
   20356           54 :       emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
   20357              : 
   20358              :       /* x1 = e1 - e0 */
   20359           54 :       emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
   20360              : 
   20361              :       /* res = a * x1 */
   20362           54 :       emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
   20363              :     }
   20364           55 : }
   20365              : 
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
   20368              : 
void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  /* mthree = -3.0 and mhalf = -0.5, the constants of the N-R step.  */
  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      /* Broadcast the scalar constants to vector constants.  */
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
        unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
                                              unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0):
     zero x0 wherever a is zero, so the inf estimate never reaches the
     multiplications below.  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX(mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
        {
          mask = gen_reg_rtx (HImode);
          /* Imm value 0x4 corresponds to not-equal comparison.  */
          emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
          emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
        }
      else
        {
          /* mask = (a != 0.0) as an all-ones/all-zeros bit pattern,
	     then x0 &= mask.  */
          mask = gen_reg_rtx (mode);
          emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
          emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
        }
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    /* e2 = e0 * x0 - 3.0, in a single fma.  */
    emit_insn (gen_rtx_SET (e2,
                            gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0 (mthree is -3.0, hence the PLUS).  */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
   20459              : 
   20460              : /* Expand fabs (OP0) and return a new rtx that holds the result.  The
   20461              :    mask for masking out the sign-bit is stored in *SMASK, if that is
   20462              :    non-null.  */
   20463              : 
   20464              : static rtx
   20465         1049 : ix86_expand_sse_fabs (rtx op0, rtx *smask)
   20466              : {
   20467         1049 :   machine_mode vmode, mode = GET_MODE (op0);
   20468         1049 :   rtx xa, mask;
   20469              : 
   20470         1049 :   xa = gen_reg_rtx (mode);
   20471         1049 :   if (mode == SFmode)
   20472              :     vmode = V4SFmode;
   20473          467 :   else if (mode == DFmode)
   20474              :     vmode = V2DFmode;
   20475              :   else
   20476            0 :     vmode = mode;
   20477         1049 :   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
   20478         1049 :   if (!VECTOR_MODE_P (mode))
   20479              :     {
   20480              :       /* We need to generate a scalar mode mask in this case.  */
   20481         1049 :       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
   20482         1049 :       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
   20483         1049 :       mask = gen_reg_rtx (mode);
   20484         1049 :       emit_insn (gen_rtx_SET (mask, tmp));
   20485              :     }
   20486         1049 :   emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
   20487              : 
   20488         1049 :   if (smask)
   20489          996 :     *smask = mask;
   20490              : 
   20491         1049 :   return xa;
   20492              : }
   20493              : 
   20494              : /* Expands a comparison of OP0 with OP1 using comparison code CODE,
   20495              :    swapping the operands if SWAP_OPERANDS is true.  The expanded
   20496              :    code is a forward jump to a newly created label in case the
   20497              :    comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
                                  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  /* flags = compare (op0, op1); codes that compare unordered are
     wrapped in UNSPEC_NOTRAP.  */
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  /* if (flags satisfies CODE) goto label */
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                              gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
   20523              : 
   20524              : /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   20525              :    using comparison code CODE.  Operands are swapped for the comparison if
   20526              :    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
   20527              : static rtx
   20528          541 : ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
   20529              :                               bool swap_operands)
   20530              : {
   20531          541 :   rtx (*insn)(rtx, rtx, rtx, rtx);
   20532          541 :   machine_mode mode = GET_MODE (op0);
   20533          541 :   rtx mask = gen_reg_rtx (mode);
   20534              : 
   20535          541 :   if (swap_operands)
   20536          362 :     std::swap (op0, op1);
   20537              : 
   20538          541 :   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
   20539              : 
   20540          541 :   emit_insn (insn (mask, op0, op1,
   20541              :                    gen_rtx_fmt_ee (code, mode, op0, op1)));
   20542          541 :   return mask;
   20543              : }
   20544              : 
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      /* Sign-bit masks are built as vector constants; map the scalar
	 MODE to its corresponding vector mode first.  */
      if (mode == SFmode)
        vmode = V4SFmode;
      else if (mode == DFmode)
        vmode = V2DFmode;
      else if (mode == HFmode)
        vmode = V8HFmode;
      else
        vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
        {
          /* We need to generate a scalar mode mask in this case.  */
          rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
          tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
          mask = gen_reg_rtx (mode);
          emit_insn (gen_rtx_SET (mask, tmp));
        }
    }
  else
    /* The caller-provided MASK clears the sign bit, so its complement
       selects it.  */
    mask = gen_rtx_NOT (mode, mask);
  /* sgn = sign bit of SIGN; result = ABS_VALUE | sgn.  */
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
   20582              : 
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
        return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1) where p is the
     mantissa precision; using the predecessor of 0.5 avoids rounding
     values just below 0.5 upward when the addition itself rounds.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (nextafter (0.5, 0.0), op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj, truncating toward zero */
  expand_fix (op0, adj, 0);
}
   20613              : 
/* Expand SSE2 sequence for computing lfloor (DO_FLOOR true) or lceil
   (DO_FLOOR false) from OP1 storing into OP0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
        xi = (long)op1;
        xi -= (double)xi > op1 ? 1 : 0;
        return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg
     For floor the adjustment is skipped when freg <= op1 (or unordered);
     for ceil the compare operands are swapped, so the increment happens
     only when freg < op1.  */
  label = ix86_expand_sse_compare_and_jump (UNLE,
                                            freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
                             ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
   20650              : 
   20651              : /* Generate and return a rtx of mode MODE for 2**n where n is the number
   20652              :    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
   20653              : 
   20654              : static rtx
   20655          996 : ix86_gen_TWO52 (machine_mode mode)
   20656              : {
   20657          996 :   const struct real_format *fmt;
   20658          996 :   REAL_VALUE_TYPE TWO52r;
   20659          996 :   rtx TWO52;
   20660              : 
   20661          996 :   fmt = REAL_MODE_FORMAT (mode);
   20662          996 :   real_2expN (&TWO52r, fmt->p - 1, mode);
   20663          996 :   TWO52 = const_double_from_real_value (TWO52r, mode);
   20664          996 :   TWO52 = force_reg (mode, TWO52);
   20665              : 
   20666          996 :   return TWO52;
   20667              : }
   20668              : 
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        xa = fabs (operand1);
        if (!isless (xa, 2**52))
          return operand1;
        two52 = 2**52;
        if (flag_rounding_math)
          {
            two52 = copysign (two52, operand1);
            xa = operand1;
          }
        xa = xa + two52 - two52;
        return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values >= 2**52 (or NaN) are already integral; return them as-is.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      /* Under -frounding-math use a signed TWO52 and the original value,
	 so the add/sub trick rounds in the dynamic rounding mode.  */
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  /* Adding and subtracting 2**52 forces rounding to integer, since the
     intermediate sum has no fraction bits left in the mantissa.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  /* Reapply the original sign so -0.0 and negative inputs stay negative.  */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20723              : 
/* Expand SSE2 sequence for computing floor (DO_FLOOR true) or ceil
   (DO_FLOOR false) from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;

     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;

        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Large magnitudes (and NaN) are already integral, keep them.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x, i.e. round toward zero via cvttsd2si/cvtsi2sd.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0)
     The compare mask is all-ones when the adjustment is needed; ANDing
     it with 1.0 yields the exact correction term.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
        tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20790              : 
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        xa = xa + TWO52 - TWO52;
        x2 = copysign (xa, x);

     Compensate.  Floor:
        if (x2 > x)
          x2 -= 1;
     Compensate.  Ceil:
        if (x2 < x)
          x2 += 1;

        if (HONOR_SIGNED_ZEROS (mode))
          x2 = copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Large magnitudes (and NaN) are already integral, keep them.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; rounds to integer without needing a
     64-bit integer conversion.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0)
     The all-ones compare mask ANDed with 1.0 gives the correction.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
        tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20861              : 
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
        double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
        if (HONOR_SIGNED_ZEROS (mode))
          return copysign (x2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values >= 2**52 (or NaN) are already integral; return them as-is.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x; the integer conversion truncates toward zero,
     which is exactly trunc semantics.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* Restore the original sign so trunc (-0.0) stays -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   20907              : 
   20908              : /* Expand SSE sequence for computing trunc from OPERAND1 storing
   20909              :    into OPERAND0 without relying on DImode truncation via cvttsd2siq
   20910              :    that is only available on 64bit targets.  */
   20911              : void
   20912            0 : ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
   20913              : {
   20914            0 :   machine_mode mode = GET_MODE (operand0);
   20915            0 :   rtx xa, xa2, TWO52, tmp, one, res, mask;
   20916            0 :   rtx_code_label *label;
   20917              : 
   20918              :   /* C code for SSE variant we expand below.
   20919              :         double xa = fabs (x), x2;
   20920              :         if (!isless (xa, TWO52))
   20921              :           return x;
   20922              :         xa2 = xa + TWO52 - TWO52;
   20923              :      Compensate:
   20924              :         if (xa2 > xa)
   20925              :           xa2 -= 1.0;
   20926              :         x2 = copysign (xa2, x);
   20927              :         return x2;
   20928              :    */
   20929              : 
   20930            0 :   TWO52 = ix86_gen_TWO52 (mode);
   20931              : 
   20932              :   /* Temporary for holding the result, initialized to the input
   20933              :      operand to ease control flow.  */
   20934            0 :   res =copy_to_reg (operand1);
   20935              : 
   20936              :   /* xa = abs (operand1) */
   20937            0 :   xa = ix86_expand_sse_fabs (res, &mask);
   20938              : 
   20939              :   /* if (!isless (xa, TWO52)) goto label; */
   20940            0 :   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   20941              : 
   20942              :   /* xa2 = xa + TWO52 - TWO52; */
   20943            0 :   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
   20944            0 :   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
   20945              : 
   20946              :   /* generate 1.0 */
   20947            0 :   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
   20948              : 
   20949              :   /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
   20950            0 :   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
   20951            0 :   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
   20952            0 :   tmp = expand_simple_binop (mode, MINUS,
   20953              :                              xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   20954              :   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
   20955            0 :   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
   20956            0 :     tmp = ix86_expand_sse_fabs (tmp, NULL);
   20957              : 
   20958              :   /* res = copysign (xa2, operand1) */
   20959            0 :   ix86_sse_copysign_to_positive (res, tmp, res, mask);
   20960              : 
   20961            0 :   emit_label (label);
   20962            0 :   LABEL_NUSES (label) = 1;
   20963              : 
   20964            0 :   emit_move_insn (operand0, res);
   20965            0 : }
   20966              : 
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        double xa = fabs (x);
        if (!isless (xa, TWO52))
          return x;
        xa = (double)(long)(xa + nextafter (0.5, 0.0));
        return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  /* Values >= 2**52 (or NaN) are already integral; return them as-is.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1); the predecessor of
     0.5 avoids rounding values just below 0.5 away from zero when the
     addition itself rounds.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + nextafter (0.5, 0.0) */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa, truncating toward zero */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   21015              : 
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
        double xa = fabs (x), xa2, x2;
        if (!isless (xa, TWO52))
          return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
        xa2 = xa + TWO52 - TWO52;
     Compensate.
        dxa = xa2 - xa;
        if (dxa <= -0.5)
          xa2 += 1;
        else if (dxa > 0.5)
          xa2 -= 1;
        x2 = copysign (xa2, x);
        return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values >= 2**52 (or NaN) are already integral; return them as-is.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; rounds to nearest integer.  */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; the rounding error, used to undo round-to-even
     and implement round-half-away-from-zero.  */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
                               0, OPTAB_DIRECT);

  /* Compensate.  The all-ones compare masks ANDed with 1.0 give the
     exact correction terms.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   21085              : 
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  /* Select the copysign and round expanders for MODE; only HFmode,
     SFmode and DFmode are supported here.  */
  switch (mode)
    {
    case E_HFmode:
      gen_copysign = gen_copysignhf3;
      gen_round = gen_sse4_1_roundhf2;
      break;
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1); the predecessor of
     0.5 avoids rounding values just below 0.5 away from zero when the
     addition itself rounds.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) using the SSE4.1 round insn with ROUND_TRUNC.  */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
   21137              : 
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  Lazily created by init_vselect_insn on first use;
   marked GTY so the garbage collector keeps it alive across passes.  */

static GTY(()) rtx_insn *vselect_insn;
   21143              : 
   21144              : /* Initialize vselect_insn.  */
   21145              : 
   21146              : static void
   21147         7489 : init_vselect_insn (void)
   21148              : {
   21149         7489 :   unsigned i;
   21150         7489 :   rtx x;
   21151              : 
   21152         7489 :   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
   21153       486785 :   for (i = 0; i < MAX_VECT_LEN; ++i)
   21154       479296 :     XVECEXP (x, 0, i) = const0_rtx;
   21155         7489 :   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
   21156              :                                                         const0_rtx), x);
   21157         7489 :   x = gen_rtx_SET (const0_rtx, x);
   21158         7489 :   start_sequence ();
   21159         7489 :   vselect_insn = emit_insn (x);
   21160         7489 :   end_sequence ();
   21161         7489 : }
   21162              : 
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.
   When !TESTING_P and the pattern is recognized, also emit a copy
   of it into the instruction stream.  The cached vselect_insn is
   used as scratch and restored to its neutral state on exit.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
                unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  /* Lazily create the shared scratch insn.  */
  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* Fill the cached PARALLEL with the NELT entries of PERM.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  /* Temporarily plug the real operand, mode and destination into the
     cached insn, remembering the vec_concat placeholder so it can be
     put back below.  */
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  /* Emit a fresh copy -- the cached insn itself must stay reusable.  */
  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  /* Restore the neutral state and clear the memoized insn code so a
     stale recog result is never reused on the next call.  */
  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
   21196              : 
   21197              : /* Similar, but generate a vec_concat from op0 and op1 as well.  */
   21198              : 
   21199              : static bool
   21200       476027 : expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   21201              :                         const unsigned char *perm, unsigned nelt,
   21202              :                         bool testing_p)
   21203              : {
   21204       476027 :   machine_mode v2mode;
   21205       476027 :   rtx x;
   21206       476027 :   bool ok;
   21207              : 
   21208       476027 :   if (vselect_insn == NULL_RTX)
   21209         5827 :     init_vselect_insn ();
   21210              : 
   21211       476027 :   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
   21212              :     return false;
   21213       476027 :   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
   21214       476027 :   PUT_MODE (x, v2mode);
   21215       476027 :   XEXP (x, 0) = op0;
   21216       476027 :   XEXP (x, 1) = op1;
   21217       476027 :   ok = expand_vselect (target, x, perm, nelt, testing_p);
   21218       476027 :   XEXP (x, 0) = const0_rtx;
   21219       476027 :   XEXP (x, 1) = const0_rtx;
   21220       476027 :   return ok;
   21221              : }
   21222              : 
   21223              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21224              :    using movss or movsd.  */
   21225              : static bool
   21226       318690 : expand_vec_perm_movs (struct expand_vec_perm_d *d)
   21227              : {
   21228       318690 :   machine_mode vmode = d->vmode;
   21229       318690 :   unsigned i, nelt = d->nelt;
   21230       318690 :   rtx x;
   21231              : 
   21232       318690 :   if (d->one_operand_p)
   21233              :     return false;
   21234              : 
   21235       291960 :   if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
   21236       140436 :       && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
   21237        84903 :       && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
   21238              :     return false;
   21239              : 
   21240              :   /* Only the first element is changed.  */
   21241       215947 :   if (d->perm[0] != nelt && d->perm[0] != 0)
   21242              :     return false;
   21243       160484 :   for (i = 1; i < nelt; ++i)
   21244       125181 :     if (d->perm[i] != i + nelt - d->perm[0])
   21245              :       return false;
   21246              : 
   21247        35303 :   if (d->testing_p)
   21248              :     return true;
   21249              : 
   21250         6396 :   if (d->perm[0] == nelt)
   21251            0 :     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
   21252              :   else
   21253         6396 :     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
   21254              : 
   21255         6396 :   emit_insn (gen_rtx_SET (d->target, x));
   21256              : 
   21257         6396 :   return true;
   21258              : }
   21259              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using insertps, i.e. when the permutation is an identity except for
   a single destination element taken from the other operand.  */
static bool
expand_vec_perm_insertps (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, cnt_s, nelt = d->nelt;
  /* Index of the single changed destination element, or -1.  */
  int cnt_d = -1;
  rtx src, dst;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE4_1
        && (vmode == V4SFmode || vmode == V4SImode
            || (TARGET_MMX_WITH_SSE
                && (vmode == V2SFmode || vmode == V2SImode)))))
    return false;

  /* First try: all elements but one are the identity on op0.  If a
     second mismatch is found, reset cnt_d and fall through to the
     op1-identity check below.  */
  for (i = 0; i < nelt; ++i)
    {
      if (d->perm[i] == i)
        continue;
      if (cnt_d != -1)
        {
          cnt_d = -1;
          break;
        }
      cnt_d = i;
    }

  if (cnt_d == -1)
    {
      /* Second try: all elements but one are the identity on op1
         (element I coming from position I + NELT).  */
      for (i = 0; i < nelt; ++i)
        {
          if (d->perm[i] == i + nelt)
            continue;
          if (cnt_d != -1)
            return false;
          cnt_d = i;
        }

      /* A full identity permutation is not handled here.  */
      if (cnt_d == -1)
        return false;
    }

  if (d->testing_p)
    return true;

  gcc_assert (cnt_d != -1);

  /* Normalize so SRC holds the element being inserted and DST the
     vector providing the unchanged elements, with cnt_s the element
     index within SRC.  */
  cnt_s = d->perm[cnt_d];
  if (cnt_s < nelt)
    {
      src = d->op0;
      dst = d->op1;
    }
  else
    {
      cnt_s -= nelt;
      src = d->op1;
      dst = d->op0;
     }
  gcc_assert (cnt_s < nelt);

  /* Immediate: source element in bits 7:6, destination in bits 5:4
     (the insertps COUNT_S / COUNT_D fields).  */
  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
                               GEN_INT (cnt_s << 6 | cnt_d << 4));
  emit_insn (x);

  return true;
}
   21331              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  A blend
   keeps every element in its lane, choosing per element between op0
   and op1.  Where the element size is too fine for an immediate-mask
   blend, the permutation is retried in a wider mode via do_subreg, or
   falls back to the variable-mask pblendvb forms.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  /* ISA gates: 512-bit needs AVX512F (and AVX512BW for sub-dword
     elements), 256-bit needs AVX2 (or AVX for the float modes),
     128-bit and narrower need SSE4.1.  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
          || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
           && (GET_MODE_SIZE (vmode) == 16
               || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
               || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
        return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  /* Build the blend mask: bit/field I set means element I comes from
     op1.  Modes whose blend insn works on wider units redirect through
     do_subreg with a recomputed mask.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2SFmode:
    case E_V2HImode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      /* One mask bit per element.  */
      for (i = 0; i < nelt; ++i)
        mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      /* Blend as V8HI: each DI element covers four HI mask bits.  */
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      /* Blend as V4HI: each SI element covers two HI mask bits.  */
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      if (TARGET_AVX2)
        {
          /* Use vpblendd instead of vpblendw.  */
          for (i = 0; i < nelt; ++i)
            mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
          break;
        }
      else
        {
          /* Blend as V8HI: each SI element covers two HI mask bits.  */
          for (i = 0; i < 4; ++i)
            mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
          vmode = V8HImode;
          goto do_subreg;
        }

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
         an immediate argument, rather than pblendvb with a vector
         argument.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          {
          use_pblendvb:
            /* Build the byte-granular selector: -1 selects op1.  */
            for (i = 0; i < nelt; ++i)
              rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

          finish_pblendvb:
            vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
            vperm = force_reg (vmode, vperm);

            if (GET_MODE_SIZE (vmode) == 4)
              emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 8)
              emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 16)
              emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
            else
              emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
            if (target != d->target)
              emit_move_insn (d->target, gen_lowpart (d->vmode, target));
            return true;
          }

      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      /* Re-emit the blend in the wider VMODE on lowpart subregs, then
         copy back to the original-mode target at the end.  */
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      /* See if bytes move in pairs, otherwise use pblendvb.  */
      for (i = 0; i < 8; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      /* See if bytes move in pairs, otherwise use pblendvb.  */
      for (i = 0; i < 4; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
         with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
        if (d->perm[i] + 2 != d->perm[i + 2])
          break;
      if (i < 32)
        {
          /* See if bytes move the same in both lanes.  If yes,
             vpblendw with immediate can be used.  */
          for (i = 0; i < 16; i += 2)
            if (d->perm[i] + 16 != d->perm[i + 16])
              goto use_pblendvb;

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i * 2] >= 32) << i;
          vmode = V16HImode;
          goto do_subreg;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          break;
      if (i < 16)
        {
          /* See if words move the same in both lanes.  If not,
             vpblendvb must be used.  */
          for (i = 0; i < 8; i++)
            if (d->perm[i] + 8 != d->perm[i + 8])
              {
                /* Use vpblendvb.  */
                for (i = 0; i < 32; ++i)
                  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

                vmode = V32QImode;
                nelt = 32;
                target = gen_reg_rtx (vmode);
                op0 = gen_lowpart (vmode, op0);
                op1 = gen_lowpart (vmode, op1);
                goto finish_pblendvb;
              }

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i] >= 16) << i;
          break;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* The 512-bit modes take their blend mask in a mask register of the
     corresponding scalar integer mode; all others use an immediate.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  /* Canonicalize vec_merge.  */
  if (swap_commutative_operands_p (op1, op0)
      /* Two operands have same precedence, then
         first bit of mask select first operand.  */
      || (!swap_commutative_operands_p (op0, op1)
          && !(mask & 1)))
    {
      unsigned n_elts = GET_MODE_NUNITS (vmode);
      std::swap (op0, op1);
      unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
      if (n_elts == HOST_BITS_PER_WIDE_INT)
        mask_all  = -1;
      else
        mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
      /* Swapping the operands inverts the meaning of every mask bit.  */
      mask = ~mask & mask_all;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
   21619              : 
   21620              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   21621              :    in terms of the variable form of vpermilps.
   21622              : 
   21623              :    Note that we will have already failed the immediate input vpermilps,
   21624              :    which requires that the high and low part shuffle be identical; the
   21625              :    variable form doesn't require that.  */
   21626              : 
   21627              : static bool
   21628       138644 : expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
   21629              : {
   21630       138644 :   rtx rperm[8], vperm;
   21631       138644 :   unsigned i;
   21632              : 
   21633       138644 :   if (!TARGET_AVX || !d->one_operand_p
   21634        12421 :       || (d->vmode != V8SImode && d->vmode != V8SFmode))
   21635              :     return false;
   21636              : 
   21637              :   /* We can only permute within the 128-bit lane.  */
   21638        20283 :   for (i = 0; i < 8; ++i)
   21639              :     {
   21640        19345 :       unsigned e = d->perm[i];
   21641        19345 :       if (i < 4 ? e >= 4 : e < 4)
   21642              :         return false;
   21643              :     }
   21644              : 
   21645          938 :   if (d->testing_p)
   21646              :     return true;
   21647              : 
   21648          657 :   for (i = 0; i < 8; ++i)
   21649              :     {
   21650          584 :       unsigned e = d->perm[i];
   21651              : 
   21652              :       /* Within each 128-bit lane, the elements of op0 are numbered
   21653              :          from 0 and the elements of op1 are numbered from 4.  */
   21654          584 :       if (e >= 8 + 4)
   21655            0 :         e -= 8;
   21656          584 :       else if (e >= 4)
   21657          292 :         e -= 4;
   21658              : 
   21659          584 :       rperm[i] = GEN_INT (e);
   21660              :     }
   21661              : 
   21662           73 :   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
   21663           73 :   vperm = force_reg (V8SImode, vperm);
   21664           73 :   rtx target = d->target;
   21665           73 :   rtx op0 = d->op0;
   21666           73 :   if (d->vmode == V8SImode)
   21667              :     {
   21668           21 :       target = lowpart_subreg (V8SFmode, target, V8SImode);
   21669           21 :       op0 = lowpart_subreg (V8SFmode, op0, V8SImode);
   21670              :     }
   21671              : 
   21672           73 :   emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm));
   21673              : 
   21674           73 :   return true;
   21675              : }
   21676              : 
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  Returns
   true and fills *ND with the widened equivalent when every pair of
   adjacent elements moves together; ND may alias D, in which case the
   permutation is rewritten in place.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
                              struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  /* Map each element mode to the mode with twice the element width.  */
  switch (d->vmode)
    {
    case E_V8QImode: mode = V4HImode; break;
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V4HImode: mode = V2SImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  /* Each even/odd element pair must start even-aligned and move as a
     unit, otherwise the wider mode can't express the permutation.  */
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  /* Recurse in place to widen further (4x, 8x) while possible.  */
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  /* Only the outermost (non-aliased) call fills in the operands.  */
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
        nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
        {
          nd->op0 = gen_lowpart (nd->vmode, d->op0);
          nd->op1 = gen_lowpart (nd->vmode, d->op1);
        }
      /* When only testing, a raw placeholder register avoids allocating
         a real pseudo.  */
      if (d->testing_p)
        nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
        nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
   21729              : 
   21730              : /* Return true if permutation D can be performed as VMODE permutation
   21731              :    instead.  */
   21732              : 
   21733              : static bool
   21734         7580 : valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
   21735              : {
   21736         7580 :   unsigned int i, j, chunk;
   21737              : 
   21738         7580 :   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
   21739         7580 :       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
   21740        18636 :       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
   21741              :     return false;
   21742              : 
   21743        11056 :   if (GET_MODE_NUNITS (vmode) >= d->nelt)
   21744              :     return true;
   21745              : 
   21746         5236 :   chunk = d->nelt / GET_MODE_NUNITS (vmode);
   21747         7186 :   for (i = 0; i < d->nelt; i += chunk)
   21748         6939 :     if (d->perm[i] & (chunk - 1))
   21749              :       return false;
   21750              :     else
   21751        12694 :       for (j = 1; j < chunk; ++j)
   21752        10744 :         if (d->perm[i] + j != d->perm[i + j])
   21753              :           return false;
   21754              : 
   21755              :   return true;
   21756              : }
   21757              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.

   First an appropriate byte/dword selector mode VMODE is chosen (and
   ISA availability checked); if D->testing_p we stop there, otherwise
   a constant selector vector is built and the variable-permutation
   insn is emitted.  Returns true iff D could be handled.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  /* Two-operand permutations need XOP vpperm for 4/8/16-byte vectors;
     for 32-byte vectors only the whole-lane vperm2i128 case is usable.  */
  if (!d->one_operand_p)
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_XOP)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_XOP)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_XOP)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	if (valid_perm_using_mode_p (V2TImode, d))
	  {
	    if (d->testing_p)
	      return true;

	    /* Use vperm2i128 insn.  The pattern uses
	       V4DImode instead of V2TImode.  */
	    target = d->target;
	    if (d->vmode != V4DImode)
	      target = gen_reg_rtx (V4DImode);
	    op0 = gen_lowpart (V4DImode, d->op0);
	    op1 = gen_lowpart (V4DImode, d->op1);
	    /* Immediate: low nibble selects the low lane's source 128-bit
	       half, bits 4..5 the high lane's.  */
	    rperm[0]
	      = GEN_INT ((d->perm[0] / (nelt / 2))
			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }
	/* FALLTHRU */

      default:
	return false;
      }
  else
    /* One-operand permutations: SSSE3 pshufb for narrow vectors,
       AVX2/AVX-512 vpermq/vpermd/vpermps/vpshufb for wide ones.  */
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* V4DImode should be already handled through
	   expand_vselect by vpermq instruction.  */
	gcc_assert (d->vmode != V4DImode);

	vmode = V32QImode;
	if (d->vmode == V8SImode
	    || d->vmode == V16HImode
	    || d->vmode == V32QImode)
	  {
	    /* First see if vpermq can be used for
	       V8SImode/V16HImode/V32QImode.  */
	    if (valid_perm_using_mode_p (V4DImode, d))
	      {
		for (i = 0; i < 4; i++)
		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V4DImode);
		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				    perm, 4, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V8SImode, d))
	      vmode = V8SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V8SFmode)
	  vmode = V8SImode;

	if (vmode == V32QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (nelt / 2))
		return false;
	  }
	break;

      case 64:
	if (!TARGET_AVX512BW)
	  return false;

	/* If vpermq didn't work, vpshufb won't work either.  */
	if (d->vmode == V8DFmode || d->vmode == V8DImode)
	  return false;

	vmode = V64QImode;
	if (d->vmode == V16SImode
	    || d->vmode == V32HImode
	    || d->vmode == V64QImode)
	  {
	    /* First see if vpermq can be used for
	       V16SImode/V32HImode/V64QImode.  */
	    if (valid_perm_using_mode_p (V8DImode, d))
	      {
		for (i = 0; i < 8; i++)
		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V8DImode);
		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				    perm, 8, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V16SImode, d))
	      vmode = V16SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V16SFmode)
	  vmode = V16SImode;

	if (vmode == V64QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.
	       (3 * nelt / 4 tests both 128-bit-lane selector bits.)  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
		return false;
	  }
	break;

      default:
	return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Build the constant selector RPERM, one entry per VMODE element.  */
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      /* Byte-granular selector: expand each element index into ELTSZ
	 consecutive byte indices.  MASK wraps indices into the range
	 the chosen insn can address (per-lane for vpshufb).  */
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
	mask = nelt / 2 - 1;
      else
	mask = nelt - 1;

      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  machine_mode vpmode = vmode;

  /* From here on NELT counts selector entries (bytes of VMODE), not
     elements of D->vmode.  */
  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      /* -128 sets the pshufb high bit, which zeroes the result byte.  */
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
	 account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
	{
	  for (i = 0; i < nelt; ++i)
	    {
	      unsigned ival = UINTVAL (rperm[i]);
	      if (ival >= nelt)
		rperm[i] = GEN_INT (ival + 16 - nelt);
	    }
	}

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
	rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
	gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
	gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
	gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
	gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
	gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
	gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
	gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
	gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
	gen = gen_avx512f_permvarv16si;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      /* Two-operand case: XOP vpperm only (the AVX2 two-operand case
	 returned earlier via vperm2i128).  */
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
	gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
	gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
	gen = gen_xop_pperm;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
   22072              : 
/* Try to expand one-operand permutation with constant mask.

   Handles D via a single AVX-512 vperm{b,w,d,q,ps,pd} variable
   permutation whose selector is a constant vector, provided both
   operands are the same register and the required ISA subsets
   (AVX512F, plus VL/BW/VBMI depending on vector width and element
   size) are available.  Returns true iff D was (or, for
   D->testing_p, could be) expanded.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  machine_mode maskmode = mode;
  unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  /* Large enough for the widest case, V64QImode (64 elements).  */
  rtx vec[64];

  /* Only genuinely one-operand permutations qualify.  */
  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  /* Accept VNxHImode and VNxQImode now.  */
  if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
    return false;

  /* vpermw.  */
  if (!TARGET_AVX512BW && inner_size == 2)
    return false;

  /* vpermb.  */
  if (!TARGET_AVX512VBMI && inner_size == 1)
    return false;

  /* Pick the permvar expander; for float modes the selector vector
     uses the matching integer mode.  */
  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_permvarv32hi;
      break;
    case E_V16HImode:
      gen = gen_avx512vl_permvarv16hi;
      break;
    case E_V8HImode:
      gen = gen_avx512vl_permvarv8hi;
      break;
    case E_V64QImode:
      gen = gen_avx512bw_permvarv64qi;
      break;
    case E_V32QImode:
      gen = gen_avx512vl_permvarv32qi;
      break;
    case E_V16QImode:
      gen = gen_avx512vl_permvarv16qi;
      break;

    default:
      return false;
    }

  if (d->testing_p)
    return true;

  /* Materialize the constant selector and emit the permutation.  */
  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}
   22153              : 
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);

/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.

   Strategies are tried in a deliberate cheapest-first order: plain
   VEC_SELECT forms (identity, broadcast, vselect, interleave, shufps),
   then blend/movs/insertps, the general SEL+CONCAT, vpermil, pshufb
   and friends, palignr, AVX-512 one-operand and vpermt2 permutations,
   and finally a retry of the whole cascade in a wider integer mode.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      /* Reduce indices mod NELT (both operands are the same) and
	 classify the permutation on the fly.  */
      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      if (identity_perm)
	{
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Try the SSE4.1 insertps instruction.  */
  if (expand_vec_perm_insertps (d))
    return true;

  /* Try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      /* Flip which operand each index refers to, then retry with the
	 operands swapped.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{w,b,s,d} instructions  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target))
;
      return true;
    }
  return false;
}
   22349              : 
   22350              : /* Canonicalize vec_perm index to make the first index
   22351              :    always comes from the first vector.  */
   22352              : static void
   22353         8157 : ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
   22354              : {
   22355         8157 :   unsigned nelt = d->nelt;
   22356         8157 :   if (d->perm[0] < nelt)
   22357              :     return;
   22358              : 
   22359            5 :   for (unsigned i = 0; i != nelt; i++)
   22360            4 :     d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
   22361              : 
   22362            1 :   std::swap (d->op0, d->op1);
   22363            1 :   return;
   22364              : }
   22365              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of a pair of shufps + shufps/pshufd instructions.  Only
   two-operand V4SI/V4SF permutations are handled here.  */
static bool
expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
{
  unsigned char perm1[4];
  machine_mode vmode = d->vmode;
  bool ok;
  unsigned i, j, k, count = 0;

  if (d->one_operand_p
      || (vmode != V4SImode && vmode != V4SFmode))
    return false;

  if (d->testing_p)
    return true;

  /* After this, d->perm[0] is guaranteed to select from op0.  */
  ix86_vec_perm_index_canon (d);
  /* Count how many elements are taken from the second operand.  */
  for (i = 0; i < 4; ++i)
    count += d->perm[i] > 3 ? 1 : 0;

  /* COUNT must be 1, 2 or 3 here; a pure one-operand selection
     (0 or 4) is rejected above via d->one_operand_p.  */
  gcc_assert (count & 3);

  rtx tmp = gen_reg_rtx (vmode);
  /* 2 from op0 and 2 from op1.  */
  if (count == 2)
    {
      unsigned char perm2[4];
      /* Gather the two op0 elements into the low half and the two op1
	 elements into the high half of TMP (selector PERM1), recording
	 in PERM2 where each requested element lands within TMP.  */
      for (i = 0, j = 0, k = 2; i < 4; ++i)
	if (d->perm[i] & 4)
	  {
	    perm1[k++] = d->perm[i];
	    perm2[i] = k - 1;
	  }
	else
	  {
	    perm1[j++] = d->perm[i];
	    perm2[i] = j - 1;
	  }

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				  perm1, d->nelt, false);
      gcc_assert (ok);
      if (vmode == V4SImode && TARGET_SSE2)
      /* pshufd.  */
	ok = expand_vselect (d->target, tmp,
			     perm2, d->nelt, false);
      else
	{
	  /* shufps.  */
	  perm2[2] += 4;
	  perm2[3] += 4;
	  ok = expand_vselect_vconcat (d->target, tmp, tmp,
				       perm2, d->nelt, false);
	}
      gcc_assert (ok);
    }
  /* 3 from one op and 1 from another.  */
  else
    {
      unsigned pair_idx = 8, lone_idx = 8, shift;

      /* Find the lone index.  */
      for (i = 0; i < 4; ++i)
	if ((d->perm[i] > 3 && count == 1)
	    || (d->perm[i] < 4 && count == 3))
	  lone_idx = i;

      /* When lone_idx is not 0, the lone element must come from the
	 second op (count == 1): after canonicalization element 0
	 always selects from the first op, so a lone op0 element can
	 only sit at index 0 (count == 3).  */
      gcc_assert (count == (lone_idx ? 1 : 3));

      /* Find the pair index that sits in the same half as the lone index.  */
      shift = lone_idx & 2;
      pair_idx = 1 - lone_idx + 2 * shift;

      /* First permute lone index and pair index into the same vector as
	 [ lone, lone, pair, pair ].  */
      perm1[1] = perm1[0]
	= (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
      perm1[3] = perm1[2]
	= (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;

      /* Always put the vector containing the lone index first.  */
      if (count == 1)
	std::swap (d->op0, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				   perm1, d->nelt, false);
      gcc_assert (ok);

      /* Refine lone and pair index to original order.  */
      perm1[shift] = lone_idx << 1;
      perm1[shift + 1] = pair_idx << 1;

      /* Select the remaining 2 elements in another vector.  */
      for (i = 2 - shift; i < 4 - shift; ++i)
	perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];

      /* Adjust to original selector.  */
      if (lone_idx > 1)
	std::swap (tmp, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (d->target, tmp, d->op1,
				   perm1, d->nelt, false);

      gcc_assert (ok);
    }

  return true;
}
   22479              : 
   22480              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   22481              :    in terms of a pair of pshuflw + pshufhw instructions.  */
   22482              : 
   22483              : static bool
   22484       102329 : expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
   22485              : {
   22486       102329 :   unsigned char perm2[MAX_VECT_LEN];
   22487       102329 :   unsigned i;
   22488       102329 :   bool ok;
   22489              : 
   22490       102329 :   if (d->vmode != V8HImode || !d->one_operand_p)
   22491              :     return false;
   22492              : 
   22493              :   /* The two permutations only operate in 64-bit lanes.  */
   22494        12835 :   for (i = 0; i < 4; ++i)
   22495        10358 :     if (d->perm[i] >= 4)
   22496              :       return false;
   22497        12329 :   for (i = 4; i < 8; ++i)
   22498         9866 :     if (d->perm[i] < 4)
   22499              :       return false;
   22500              : 
   22501         2463 :   if (d->testing_p)
   22502              :     return true;
   22503              : 
   22504              :   /* Emit the pshuflw.  */
   22505          134 :   memcpy (perm2, d->perm, 4);
   22506          670 :   for (i = 4; i < 8; ++i)
   22507          536 :     perm2[i] = i;
   22508          134 :   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
   22509          134 :   gcc_assert (ok);
   22510              : 
   22511              :   /* Emit the pshufhw.  */
   22512          134 :   memcpy (perm2 + 4, d->perm + 4, 4);
   22513          670 :   for (i = 0; i < 4; ++i)
   22514          536 :     perm2[i] = i;
   22515          134 :   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
   22516          134 :   gcc_assert (ok);
   22517              : 
   22518              :   return true;
   22519              : }
   22520              : 
/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle.  */
static bool
expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
{
  /* Only two-operand 64-bit vector modes, and only when 64-bit
     vectors live in SSE registers.  */
  if (GET_MODE_BITSIZE (d->vmode) != 64
      || !TARGET_MMX_WITH_SSE
      || d->one_operand_p)
    return false;

  /* Pick the 128-bit mode in which the concatenated operands will be
     shuffled; element sizes below SImode need pshufb, i.e. SSSE3.  */
  machine_mode widen_vmode;
  switch (d->vmode)
    {
    /* pshufd.  */
    case E_V2SImode:
      widen_vmode = V4SImode;
      break;

    /* pshufd.  */
    case E_V2SFmode:
      widen_vmode = V4SFmode;
      break;

    case E_V4HImode:
      widen_vmode = V8HImode;
      /* pshufb.  */
      if (!TARGET_SSSE3)
	return false;
      break;

    case E_V8QImode:
      /* pshufb.  */
      widen_vmode = V16QImode;
      if (!TARGET_SSSE3)
	return false;
      break;

    default:
      return false;
    }

  if (d->testing_p)
    return true;

  /* Concatenate both 64-bit operands into one 128-bit register and
     perform the permutation there as a one-operand shuffle.  */
  struct expand_vec_perm_d dperm;
  dperm.target = gen_reg_rtx (widen_vmode);
  rtx op0 = gen_reg_rtx (widen_vmode);
  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
  dperm.op0 = op0;
  dperm.op1 = op0;
  dperm.vmode = widen_vmode;
  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
  dperm.nelt = nelt;
  dperm.one_operand_p = true;
  dperm.testing_p = false;

  /* Duplicate the original selector into both halves of the widened
     selector; only the low half of the result is used below.  */
  for (unsigned i = 0; i != nelt / 2; i++)
    {
      dperm.perm[i] = d->perm[i];
      dperm.perm[i + nelt / 2] = d->perm[i];
    }

  gcc_assert (expand_vec_perm_1 (&dperm));
  /* The desired result is the low 64 bits of the widened shuffle.  */
  emit_move_insn (d->target, lowpart_subreg (d->vmode,
					     dperm.target,
					     dperm.vmode));
  return true;
}
   22588              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  /* Compute the used index range [MIN, MAX], both for the permutation
     as given and for the operand-swapped variant (MINSWAP/MAXSWAP).
     For 256-bit vectors the indices are first folded so that the two
     128-bit lanes are measured together.  */
  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      /* The range doesn't fit as given (MIN == 0 means no shift is
	 needed, so palignr buys nothing); retry with the operands
	 swapped, which only makes sense for two-operand shuffles.  */
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  /* Rebase all indices by MIN and check whether the rebased selector
     is already the identity, i.e. palignr alone suffices.  */
  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (V1TImode);
      emit_insn (gen_ssse3_palignrv1ti (target,
					gen_lowpart (V1TImode, dcopy.op1),
					gen_lowpart (V1TImode, dcopy.op0),
					shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
   22723              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  WHICH accumulates bit 0 for displaced elements
     taken from op0 and bit 1 for those taken from op1.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  /* Mask each index into the single source vector's range.  */
  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  /* For modes narrower than 16 bytes the one-operand expansion may
     legitimately fail; for 16-byte modes it must succeed.  */
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  /* Blend: keep in-place elements, take displaced ones from the
     shuffled vector at their final position.  */
  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}
   22808              : 
   22809              : static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
   22810              : 
   22811              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   22812              :    a two vector permutation into a single vector permutation by using
   22813              :    an interleave operation to merge the vectors.  */
   22814              : 
   22815              : static bool
   22816        96731 : expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   22817              : {
   22818        96731 :   struct expand_vec_perm_d dremap, dfinal;
   22819        96731 :   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
   22820        96731 :   unsigned HOST_WIDE_INT contents;
   22821        96731 :   unsigned char remap[2 * MAX_VECT_LEN];
   22822        96731 :   rtx_insn *seq;
   22823        96731 :   bool ok, same_halves = false;
   22824              : 
   22825        96731 :   if (GET_MODE_SIZE (d->vmode) == 4
   22826       174560 :       || GET_MODE_SIZE (d->vmode) == 8
   22827       237383 :       || GET_MODE_SIZE (d->vmode) == 16)
   22828              :     {
   22829        89330 :       if (d->one_operand_p)
   22830              :         return false;
   22831              :     }
   22832        14802 :   else if (GET_MODE_SIZE (d->vmode) == 32)
   22833              :     {
   22834         7049 :       if (!TARGET_AVX)
   22835              :         return false;
   22836              :       /* For 32-byte modes allow even d->one_operand_p.
   22837              :          The lack of cross-lane shuffling in some instructions
   22838              :          might prevent a single insn shuffle.  */
   22839         7049 :       dfinal = *d;
   22840         7049 :       dfinal.testing_p = true;
   22841              :       /* If expand_vec_perm_interleave3 can expand this into
   22842              :          a 3 insn sequence, give up and let it be expanded as
   22843              :          3 insn sequence.  While that is one insn longer,
   22844              :          it doesn't need a memory operand and in the common
   22845              :          case that both interleave low and high permutations
   22846              :          with the same operands are adjacent needs 4 insns
   22847              :          for both after CSE.  */
   22848         7049 :       if (expand_vec_perm_interleave3 (&dfinal))
   22849              :         return false;
   22850              :     }
   22851              :   else
   22852              :     return false;
   22853              : 
   22854              :   /* Examine from whence the elements come.  */
   22855        91013 :   contents = 0;
   22856       686429 :   for (i = 0; i < nelt; ++i)
   22857       595416 :     contents |= HOST_WIDE_INT_1U << d->perm[i];
   22858              : 
   22859        91013 :   memset (remap, 0xff, sizeof (remap));
   22860        91013 :   dremap = *d;
   22861              : 
   22862        91013 :   if (GET_MODE_SIZE (d->vmode) == 4
   22863       174240 :       || GET_MODE_SIZE (d->vmode) == 8)
   22864              :     {
   22865        23341 :       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   22866              : 
   22867              :       /* Split the two input vectors into 4 halves.  */
   22868        23341 :       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   22869        23341 :       h2 = h1 << nelt2;
   22870        23341 :       h3 = h2 << nelt2;
   22871        23341 :       h4 = h3 << nelt2;
   22872              : 
   22873              :       /* If the elements from the low halves use interleave low,
   22874              :          and similarly for interleave high.  */
   22875        23341 :       if ((contents & (h1 | h3)) == contents)
   22876              :         {
   22877              :           /* punpckl* */
   22878         3247 :           for (i = 0; i < nelt2; ++i)
   22879              :             {
   22880         2292 :               remap[i] = i * 2;
   22881         2292 :               remap[i + nelt] = i * 2 + 1;
   22882         2292 :               dremap.perm[i * 2] = i;
   22883         2292 :               dremap.perm[i * 2 + 1] = i + nelt;
   22884              :             }
   22885              :         }
   22886        22386 :       else if ((contents & (h2 | h4)) == contents)
   22887              :         {
   22888              :           /* punpckh* */
   22889         2836 :           for (i = 0; i < nelt2; ++i)
   22890              :             {
   22891         2000 :               remap[i + nelt2] = i * 2;
   22892         2000 :               remap[i + nelt + nelt2] = i * 2 + 1;
   22893         2000 :               dremap.perm[i * 2] = i + nelt2;
   22894         2000 :               dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   22895              :             }
   22896              :         }
   22897              :       else
   22898              :         return false;
   22899              :     }
   22900       135344 :   else if (GET_MODE_SIZE (d->vmode) == 16)
   22901              :     {
   22902        60841 :       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   22903              : 
   22904              :       /* Split the two input vectors into 4 halves.  */
   22905        60841 :       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   22906        60841 :       h2 = h1 << nelt2;
   22907        60841 :       h3 = h2 << nelt2;
   22908        60841 :       h4 = h3 << nelt2;
   22909              : 
   22910              :       /* If the elements from the low halves use interleave low, and similarly
   22911              :          for interleave high.  If the elements are from mis-matched halves, we
   22912              :          can use shufps for V4SF/V4SI or do a DImode shuffle.  */
   22913        60841 :       if ((contents & (h1 | h3)) == contents)
   22914              :         {
   22915              :           /* punpckl* */
   22916         5923 :           for (i = 0; i < nelt2; ++i)
   22917              :             {
   22918         4382 :               remap[i] = i * 2;
   22919         4382 :               remap[i + nelt] = i * 2 + 1;
   22920         4382 :               dremap.perm[i * 2] = i;
   22921         4382 :               dremap.perm[i * 2 + 1] = i + nelt;
   22922              :             }
   22923         1541 :           if (!TARGET_SSE2 && d->vmode == V4SImode)
   22924            0 :             dremap.vmode = V4SFmode;
   22925              :         }
   22926        59300 :       else if ((contents & (h2 | h4)) == contents)
   22927              :         {
   22928              :           /* punpckh* */
   22929         5130 :           for (i = 0; i < nelt2; ++i)
   22930              :             {
   22931         3762 :               remap[i + nelt2] = i * 2;
   22932         3762 :               remap[i + nelt + nelt2] = i * 2 + 1;
   22933         3762 :               dremap.perm[i * 2] = i + nelt2;
   22934         3762 :               dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   22935              :             }
   22936         1368 :           if (!TARGET_SSE2 && d->vmode == V4SImode)
   22937            0 :             dremap.vmode = V4SFmode;
   22938              :         }
   22939        57932 :       else if ((contents & (h1 | h4)) == contents)
   22940              :         {
   22941              :           /* shufps */
   22942         2537 :           for (i = 0; i < nelt2; ++i)
   22943              :             {
   22944         1828 :               remap[i] = i;
   22945         1828 :               remap[i + nelt + nelt2] = i + nelt2;
   22946         1828 :               dremap.perm[i] = i;
   22947         1828 :               dremap.perm[i + nelt2] = i + nelt + nelt2;
   22948              :             }
   22949          709 :           if (nelt != 4)
   22950              :             {
   22951              :               /* shufpd */
   22952           69 :               dremap.vmode = V2DImode;
   22953           69 :               dremap.nelt = 2;
   22954           69 :               dremap.perm[0] = 0;
   22955           69 :               dremap.perm[1] = 3;
   22956              :             }
   22957              :         }
   22958        57223 :       else if ((contents & (h2 | h3)) == contents)
   22959              :         {
   22960              :           /* shufps */
   22961         3423 :           for (i = 0; i < nelt2; ++i)
   22962              :             {
   22963         2410 :               remap[i + nelt2] = i;
   22964         2410 :               remap[i + nelt] = i + nelt2;
   22965         2410 :               dremap.perm[i] = i + nelt2;
   22966         2410 :               dremap.perm[i + nelt2] = i + nelt;
   22967              :             }
   22968         1013 :           if (nelt != 4)
   22969              :             {
   22970              :               /* shufpd */
   22971           64 :               dremap.vmode = V2DImode;
   22972           64 :               dremap.nelt = 2;
   22973           64 :               dremap.perm[0] = 1;
   22974           64 :               dremap.perm[1] = 2;
   22975              :             }
   22976              :         }
   22977              :       else
   22978              :         return false;
   22979              :     }
   22980              :   else
   22981              :     {
   22982         6831 :       unsigned int nelt4 = nelt / 4, nzcnt = 0;
   22983         6831 :       unsigned HOST_WIDE_INT q[8];
   22984         6831 :       unsigned int nonzero_halves[4];
   22985              : 
   22986              :       /* Split the two input vectors into 8 quarters.  */
   22987         6831 :       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
   22988        54648 :       for (i = 1; i < 8; ++i)
   22989        47817 :         q[i] = q[0] << (nelt4 * i);
   22990        34155 :       for (i = 0; i < 4; ++i)
   22991        27324 :         if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
   22992              :           {
   22993        24487 :             nonzero_halves[nzcnt] = i;
   22994        24487 :             ++nzcnt;
   22995              :           }
   22996              : 
   22997         6831 :       if (nzcnt == 1)
   22998              :         {
   22999          221 :           gcc_assert (d->one_operand_p);
   23000          221 :           nonzero_halves[1] = nonzero_halves[0];
   23001          221 :           same_halves = true;
   23002              :         }
   23003         6610 :       else if (d->one_operand_p)
   23004              :         {
   23005           23 :           gcc_assert (nonzero_halves[0] == 0);
   23006           23 :           gcc_assert (nonzero_halves[1] == 1);
   23007              :         }
   23008              : 
   23009         6831 :       if (nzcnt <= 2)
   23010              :         {
   23011          544 :           if (d->perm[0] / nelt2 == nonzero_halves[1])
   23012              :             {
   23013              :               /* Attempt to increase the likelihood that dfinal
   23014              :                  shuffle will be intra-lane.  */
   23015          229 :               std::swap (nonzero_halves[0], nonzero_halves[1]);
   23016              :             }
   23017              : 
   23018              :           /* vperm2f128 or vperm2i128.  */
   23019         3526 :           for (i = 0; i < nelt2; ++i)
   23020              :             {
   23021         2982 :               remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
   23022         2982 :               remap[i + nonzero_halves[0] * nelt2] = i;
   23023         2982 :               dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
   23024         2982 :               dremap.perm[i] = i + nonzero_halves[0] * nelt2;
   23025              :             }
   23026              : 
   23027          544 :           if (d->vmode != V8SFmode
   23028              :               && d->vmode != V4DFmode
   23029              :               && d->vmode != V8SImode)
   23030              :             {
   23031          132 :               dremap.vmode = V8SImode;
   23032          132 :               dremap.nelt = 8;
   23033          660 :               for (i = 0; i < 4; ++i)
   23034              :                 {
   23035          528 :                   dremap.perm[i] = i + nonzero_halves[0] * 4;
   23036          528 :                   dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
   23037              :                 }
   23038              :             }
   23039              :         }
   23040         6287 :       else if (d->one_operand_p)
   23041         5822 :         return false;
   23042         6287 :       else if (TARGET_AVX2
   23043         2600 :                && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
   23044              :         {
   23045              :           /* vpunpckl* */
   23046          491 :           for (i = 0; i < nelt4; ++i)
   23047              :             {
   23048          247 :               remap[i] = i * 2;
   23049          247 :               remap[i + nelt] = i * 2 + 1;
   23050          247 :               remap[i + nelt2] = i * 2 + nelt2;
   23051          247 :               remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
   23052          247 :               dremap.perm[i * 2] = i;
   23053          247 :               dremap.perm[i * 2 + 1] = i + nelt;
   23054          247 :               dremap.perm[i * 2 + nelt2] = i + nelt2;
   23055          247 :               dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
   23056              :             }
   23057              :         }
   23058         6043 :       else if (TARGET_AVX2
   23059         2356 :                && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
   23060              :         {
   23061              :           /* vpunpckh* */
   23062          445 :           for (i = 0; i < nelt4; ++i)
   23063              :             {
   23064          224 :               remap[i + nelt4] = i * 2;
   23065          224 :               remap[i + nelt + nelt4] = i * 2 + 1;
   23066          224 :               remap[i + nelt2 + nelt4] = i * 2 + nelt2;
   23067          224 :               remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
   23068          224 :               dremap.perm[i * 2] = i + nelt4;
   23069          224 :               dremap.perm[i * 2 + 1] = i + nelt + nelt4;
   23070          224 :               dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
   23071          224 :               dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
   23072              :             }
   23073              :         }
   23074              :       else
   23075              :         return false;
   23076              :     }
   23077              : 
   23078              :   /* Use the remapping array set up above to move the elements from their
   23079              :      swizzled locations into their final destinations.  */
   23080         7431 :   dfinal = *d;
   23081        48627 :   for (i = 0; i < nelt; ++i)
   23082              :     {
   23083        41196 :       unsigned e = remap[d->perm[i]];
   23084        41196 :       gcc_assert (e < nelt);
   23085              :       /* If same_halves is true, both halves of the remapped vector are the
   23086              :          same.  Avoid cross-lane accesses if possible.  */
   23087        41196 :       if (same_halves && i >= nelt2)
   23088              :         {
   23089          816 :           gcc_assert (e < nelt2);
   23090          816 :           dfinal.perm[i] = e + nelt2;
   23091              :         }
   23092              :       else
   23093        40380 :         dfinal.perm[i] = e;
   23094              :     }
   23095         7431 :   if (!d->testing_p)
   23096              :     {
   23097         2773 :       dremap.target = gen_reg_rtx (dremap.vmode);
   23098         2773 :       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
   23099              :     }
   23100         7431 :   dfinal.op1 = dfinal.op0;
   23101         7431 :   dfinal.one_operand_p = true;
   23102              : 
   23103              :   /* Test if the final remap can be done with a single insn.  For V4SFmode or
   23104              :      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
   23105         7431 :   start_sequence ();
   23106         7431 :   ok = expand_vec_perm_1 (&dfinal);
   23107         7431 :   seq = end_sequence ();
   23108              : 
   23109         7431 :   if (!ok)
   23110              :     return false;
   23111              : 
   23112         6383 :   if (d->testing_p)
   23113              :     return true;
   23114              : 
   23115         2734 :   if (dremap.vmode != dfinal.vmode)
   23116              :     {
   23117           53 :       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
   23118           53 :       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
   23119              :     }
   23120              : 
   23121         2734 :   ok = expand_vec_perm_1 (&dremap);
   23122         2734 :   gcc_assert (ok);
   23123              : 
   23124         2734 :   emit_insn (seq);
   23125         2734 :   return true;
   23126              : }
   23127              : 
   23128              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   23129              :    a single vector cross-lane permutation into vpermq followed
   23130              :    by any of the single insn permutations.  */
   23131              : 
static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  /* Only one-operand 32-byte QI/HI vector permutations are handled
     here; everything else is left to the other expanders.  */
  if (!(TARGET_AVX2
        && (d->vmode == V32QImode || d->vmode == V16HImode)
        && d->one_operand_p))
    return false;

  /* contents[0] resp. contents[1] is a 4-bit mask of which 64-bit
     quarters of the input the low resp. high half of the result
     reads from (nelt4 elements form one DImode quarter).  */
  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  /* Each 128-bit result lane can hold only two source quarters after
     the vpermq step, so fail if either half needs more than two.  */
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
          return false;
    }

  if (d->testing_p)
    return true;

  /* dremap is the vpermq: viewed as V4DImode, gather the quarters
     needed by each result half into the corresponding lane.  */
  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0)
          dremap.perm[2 * i + cnt++] = j;
      /* Pad unused quarter slots with quarter 0; dfinal below never
         selects from a padded slot.  */
      for (; cnt < 2; ++cnt)
        dremap.perm[2 * i + cnt] = 0;
    }

  /* dfinal is an intra-lane permutation of the vpermq result moving
     each element into its requested final position.  */
  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      /* j indexes the pair of dremap.perm slots describing the lane
         that result element i lives in (0-1 low lane, 2-3 high).  */
      if (i == nelt2)
        j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
        ;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
        dfinal.perm[i] |= nelt4;
      else
        gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}
   23206              : 
   23207              : static bool canonicalize_perm (struct expand_vec_perm_d *d);
   23208              : 
   23209              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   23210              :    a vector permutation using two instructions, vperm2f128 resp.
   23211              :    vperm2i128 followed by any single in-lane permutation.  */
   23212              : 
static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  /* Only 32-byte vectors are handled; non-FP 32-byte modes
     additionally require AVX2 for vperm2i128.  */
  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
        {
          /* The second shuffle for e.g. V4DFmode has
             0123 and ABCD operands.
             Ignore AB23, as 23 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (1 << 2)) continue;
          /* And 01CD, as 01 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 0) continue;
          /* And 4567, as then the vperm2[fi]128 doesn't change
             anything on the original 4567 second operand.  */
          if ((perm & 0xf) == ((3 << 2) | 2)) continue;
        }
      else
        {
          /* The second shuffle for e.g. V4DFmode has
             4567 and ABCD operands.
             Ignore AB67, as 67 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (3 << 2)) continue;
          /* And 45CD, as 45 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 2) continue;
          /* And 0123, as then the vperm2[fi]128 doesn't change
             anything on the original 0123 first operand.  */
          if ((perm & 0xf) == (1 << 2)) continue;
        }

      /* For this candidate immediate, verify every requested element
         can be taken either from the vperm2[fi]128 result (encoded as
         the second shuffle operand, hence the nelt + bias) or in-lane
         from the untouched original operand; break out otherwise.  */
      for (i = 0; i < nelt; i++)
        {
          j = d->perm[i] / nelt2;
          if (j == ((perm >> (2 * (i >= nelt2))) & 3))
            dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
          else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
            dsecond.perm[i] = d->perm[i] & (nelt - 1);
          else
            break;
        }

      /* i == nelt means every element matched; check whether the
         resulting two-operand shuffle expands to a single insn.  */
      if (i == nelt)
        {
          start_sequence ();
          ok = expand_vec_perm_1 (&dsecond);
          end_sequence ();
        }
      else
        ok = false;

      if (ok)
        {
          if (d->testing_p)
            return true;

          /* Found a usable second shuffle.  dfirst will be
             vperm2f128 on d->op0 and d->op1.  */
          dsecond.testing_p = false;
          dfirst = *d;
          dfirst.target = gen_reg_rtx (d->vmode);
          for (i = 0; i < nelt; i++)
            dfirst.perm[i] = (i & (nelt2 - 1))
                             + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

          canonicalize_perm (&dfirst);
          ok = expand_vec_perm_1 (&dfirst);
          gcc_assert (ok);

          /* And dsecond is some single insn shuffle, taking
             d->op0 and result of vperm2f128 (if perm < 16) or
             d->op1 and result of vperm2f128 (otherwise).  */
          if (perm >= 16)
            dsecond.op0 = dsecond.op1;
          dsecond.op1 = dfirst.target;

          ok = expand_vec_perm_1 (&dsecond);
          gcc_assert (ok);

          return true;
        }

      /* For one operand, the only useful vperm2f128 permutation is 0x01
         aka lanes swap.  */
      if (d->one_operand_p)
        return false;
    }

  return false;
}
   23325              : 
   23326              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   23327              :    a two vector permutation using 2 intra-lane interleave insns
   23328              :    and cross-lane shuffle for 32-byte vectors.  */
   23329              : 
   23330              : static bool
   23331        34319 : expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
   23332              : {
   23333        34319 :   unsigned i, nelt;
   23334        34319 :   rtx (*gen) (rtx, rtx, rtx);
   23335              : 
   23336        34319 :   if (d->one_operand_p)
   23337              :     return false;
   23338        33087 :   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
   23339              :     ;
   23340        24880 :   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
   23341              :     ;
   23342              :   else
   23343              :     return false;
   23344              : 
   23345         9717 :   nelt = d->nelt;
   23346         9717 :   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
   23347              :     return false;
   23348         9877 :   for (i = 0; i < nelt; i += 2)
   23349         9521 :     if (d->perm[i] != d->perm[0] + i / 2
   23350         8648 :         || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
   23351              :       return false;
   23352              : 
   23353          356 :   if (d->testing_p)
   23354              :     return true;
   23355              : 
   23356           56 :   switch (d->vmode)
   23357              :     {
   23358           32 :     case E_V32QImode:
   23359           32 :       if (d->perm[0])
   23360              :         gen = gen_vec_interleave_highv32qi;
   23361              :       else
   23362           16 :         gen = gen_vec_interleave_lowv32qi;
   23363              :       break;
   23364           18 :     case E_V16HImode:
   23365           18 :       if (d->perm[0])
   23366              :         gen = gen_vec_interleave_highv16hi;
   23367              :       else
   23368            9 :         gen = gen_vec_interleave_lowv16hi;
   23369              :       break;
   23370            0 :     case E_V8SImode:
   23371            0 :       if (d->perm[0])
   23372              :         gen = gen_vec_interleave_highv8si;
   23373              :       else
   23374            0 :         gen = gen_vec_interleave_lowv8si;
   23375              :       break;
   23376            4 :     case E_V4DImode:
   23377            4 :       if (d->perm[0])
   23378              :         gen = gen_vec_interleave_highv4di;
   23379              :       else
   23380            2 :         gen = gen_vec_interleave_lowv4di;
   23381              :       break;
   23382            2 :     case E_V8SFmode:
   23383            2 :       if (d->perm[0])
   23384              :         gen = gen_vec_interleave_highv8sf;
   23385              :       else
   23386            1 :         gen = gen_vec_interleave_lowv8sf;
   23387              :       break;
   23388            0 :     case E_V4DFmode:
   23389            0 :       if (d->perm[0])
   23390              :         gen = gen_vec_interleave_highv4df;
   23391              :       else
   23392            0 :         gen = gen_vec_interleave_lowv4df;
   23393              :       break;
   23394            0 :     default:
   23395            0 :       gcc_unreachable ();
   23396              :     }
   23397              : 
   23398           56 :   emit_insn (gen (d->target, d->op0, d->op1));
   23399           56 :   return true;
   23400              : }
   23401              : 
   23402              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   23403              :    a single vector permutation using a single intra-lane vector
   23404              :    permutation, vperm2f128 swapping the lanes and vblend* insn blending
   23405              :    the non-swapped and swapped vectors together.  */
   23406              : 
static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only useful for AVX without AVX2 (AVX2 has better alternatives),
     for the one-operand V8SF/V4DF permutations this handles.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  /* Build dfirst, an intra-lane permutation: each requested source
     element d->perm[i] is placed at position j, which is i with its
     lane bit replaced by the source element's lane, so source and
     destination of dfirst always share a lane.  0xff marks slots not
     yet assigned; conflicting assignments mean no single intra-lane
     shuffle can feed the blend, so fail.  When j != i, final element
     i must instead come from the lane-swapped copy, recorded in the
     blend mask msk.  */
  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
        return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
        msk |= (1 << i);
    }
  /* Fill the unconstrained slots with the identity.  */
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  /* Expand dfirst into a scratch sequence first, so nothing is
     emitted if it turns out not to be a single insn.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  /* dsecond is dfirst with the two 128-bit lanes swapped
     (element i taken from i ^ nelt2).  */
  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  /* Blend the non-swapped and swapped vectors; msk selects, per
     element, the lane-swapped copy.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
   23468              : 
   23469              : /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   23470              :    a two vector permutation using two single vector permutations and
   23471              :    {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   23472              :    of dfirst or dsecond is identity permutation.  */
   23473              : 
static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  /* 16-byte vectors need SSE (SSE2 for non-float element modes);
     32-byte vectors need AVX (AVX2 for non-float element modes) and
     interleave within 128-bit lanes, hence lane = nelt2 there.  */
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
        return false;
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
        return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
        return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
        return false;
      lane = nelt2;
    }
  else
    return false;

  /* The permutation must alternate between the two operands element
     by element (matching an interleave result), in either order.  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  /* Build the two single-operand pre-shuffles: pack the elements the
     interleave takes from op0 into dfirst and those from op1 into
     dsecond, at the low half of each lane (first store).  The second
     store duplicates each value into the corresponding high-half
     slot, which the final interleave selector never reads, so the
     perms stay fully specified.  ident1/ident2 track whether the
     respective pre-shuffle is the identity and can be skipped.  */
  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
        dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
        if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
          ident2 = false;
        dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
          = d->perm[i] - nelt;
      }
    else
      {
        dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
        if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
          ident1 = false;
        dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  /* In two_insn mode at least one pre-shuffle must be a no-op.  */
  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
        dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
        dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      /* If element 0 comes from op1, the interleave operands must be
         swapped to keep the op0-first selector built below valid.  */
      if (d->perm[0] >= nelt)
        std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Expand each needed pre-shuffle into a held-back sequence so that
     nothing is emitted unless the whole combination succeeds.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = end_sequence ();

      if (!ok)
        return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = end_sequence ();

      if (!ok)
        return false;
    }

  if (d->testing_p)
    return true;

  /* Build the interleave-low selector {0, nelt, 1, nelt + 1, ...}
     (per 128-bit lane for 32-byte modes).  */
  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
        dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
        dfinal.perm[i] += nelt;
    }
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
                               dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
   23585              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If TWO_INSN, succeed only if one of dfirst or dsecond is
   an identity permutation (so the whole thing costs at most two insns:
   one permutation plus the blend).  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  /* Whether the op0 (resp. op1) side of the split turns out to be an
     identity permutation, i.e. needs no shuffle insn of its own.  */
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Split D into two one-operand permutations: DFIRST shuffles op0 into
     the slots whose result element comes from op0, DSECOND does the same
     for op1.  Slots a permutation doesn't care about are marked 0xff.  */
  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  Replace each don't-care
     slot with a concrete index; for 32-byte modes prefer mirroring the
     paired element from the other 128-bit lane so the single-operand
     permutation stays intra-lane-friendly, otherwise use the identity.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      /* Only allocate scratch registers for the sides that actually
	 need a shuffle; identity sides feed the original operand
	 straight into the final blend.  */
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Expand each needed sub-permutation into a detached sequence first,
     so that nothing is emitted into the instruction stream if either
     expansion fails.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* The final step is a pure blend: element i is taken from whichever
     shuffled operand originally supplied d->perm[i].  */
  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}
   23703              : 
   23704              : /* A subroutine of ix86_expand_vec_perm_const_1.
   23705              :    Implement a permutation with psrlw, psllw and por.
   23706              :    It handles case:
   23707              :    __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
   23708              :    __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
   23709              : 
   23710              : static bool
   23711        25992 : expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
   23712              : {
   23713        25992 :   unsigned i;
   23714        25992 :   rtx (*gen_shr) (rtx, rtx, rtx);
   23715        25992 :   rtx (*gen_shl) (rtx, rtx, rtx);
   23716        25992 :   rtx (*gen_or) (rtx, rtx, rtx);
   23717        25992 :   machine_mode mode = VOIDmode;
   23718              : 
   23719        25992 :   if (!TARGET_SSE2 || !d->one_operand_p)
   23720              :     return false;
   23721              : 
   23722         5185 :   switch (d->vmode)
   23723              :     {
   23724         1395 :     case E_V8QImode:
   23725         1395 :       if (!TARGET_MMX_WITH_SSE)
   23726              :         return false;
   23727              :       mode = V4HImode;
   23728              :       gen_shr = gen_lshrv4hi3;
   23729              :       gen_shl = gen_ashlv4hi3;
   23730              :       gen_or = gen_iorv4hi3;
   23731              :       break;
   23732              :     case E_V16QImode:
   23733              :       mode = V8HImode;
   23734              :       gen_shr = gen_lshrv8hi3;
   23735              :       gen_shl = gen_ashlv8hi3;
   23736              :       gen_or = gen_iorv8hi3;
   23737              :       break;
   23738              :     default: return false;
   23739              :     }
   23740              : 
   23741         3082 :   if (!rtx_equal_p (d->op0, d->op1))
   23742              :     return false;
   23743              : 
   23744        12122 :   for (i = 0; i < d->nelt; i += 2)
   23745        10684 :     if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
   23746              :       return false;
   23747              : 
   23748         1438 :   if (d->testing_p)
   23749              :     return true;
   23750              : 
   23751           26 :   rtx tmp1 = gen_reg_rtx (mode);
   23752           26 :   rtx tmp2 = gen_reg_rtx (mode);
   23753           26 :   rtx op0 = force_reg (d->vmode, d->op0);
   23754              : 
   23755           26 :   emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
   23756           26 :   emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
   23757           26 :   emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
   23758           26 :   emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
   23759           26 :   emit_insn (gen_or (tmp1, tmp1, tmp2));
   23760           26 :   emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
   23761              : 
   23762           26 :   return true;
   23763              : }
   23764              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  /* This strategy always succeeds for V4DF on AVX, so for a dry run
     no further analysis is needed.  */
  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  /* DFIRST gathers the 128-bit lanes containing the sources of result
     elements 0 and 2; DSECOND the lanes for result elements 1 and 3.
     Masking with ~1 rounds each index down to the start of its lane.  */
  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  /* DTHIRD then picks, per result element, the low or high double
     (perm % 2) out of the two gathered vectors (offsets +4 and +6
     select from DSECOND's result).  */
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  /* All three sub-permutations must be single-insn expansible by
     construction; short-circuit evaluation keeps the expansion order
     dfirst, dsecond, dthird.  */
  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
   23815              : 
   23816              : static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
   23817              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  /* which1/which2 record whether each sub-permutation pulls from op0
     (bit 0) and/or op1 (bit 1); msk collects, per result element, whether
     it comes from the lane-swapped vector and becomes the blend mask.  */
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Pre-AVX2 256-bit float modes only; with AVX2 better sequences exist.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  /* Route each result element either to DFIRST (source already in the
     destination's lane, j == i) or to DSECOND (source in the opposite
     lane; j places it so a later lane swap moves it into position).  */
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  /* If everything lands in one of the two vectors there is nothing to
     blend; other strategies handle that better.  */
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  /* Fill the don't-care slots with identity-style indexes, keeping each
     sub-permutation one-operand when it only ever reads op1.  */
  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  /* Expand both sub-permutations into detached sequences so nothing is
     emitted unless the whole strategy succeeds.  */
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  /* DTHIRD swaps the two 128-bit lanes of DSECOND's result
     (i ^ nelt2), realizable as a single vperm2f128.  */
  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
   23913              : 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  /* pshufb requires SSSE3 and operates on 4-, 8- or 16-byte vectors.  */
  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  /* Any two-operand permutation of a supported size can be done this
     way, so a dry run always succeeds.  */
  if (d->testing_p)
    return true;

  /* Select the QImode vector mode and pshufb expander matching the
     byte size of D's mode.  */
  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      /* Expand the element-level index into eltsz byte-level indexes.  */
      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      /* Pad the unused tail of the 16-byte masks with the zeroing
	 value (relevant for the sub-16-byte vector modes).  */
      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  /* Or the two shuffles together; go through a scratch register when
     the target's mode differs from the QImode working mode.  */
  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   24001              : 
/* Implement arbitrary permutation of one V32QImode and V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* Any one-operand 256-bit byte/word permutation can be done this way.  */
  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      /* e is the source index reduced to within a 128-bit lane; which is
	 nonzero (a byte offset of 16) when the source lies in the lane
	 opposite the destination slot.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp (vpermq with selector 2,3,0,1).  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* Combine the intra-lane and swapped cross-lane parts with vpor;
     use a scratch when the target isn't already V32QImode.  */
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   24072              : 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* Accept only extract-even (perm[i] == 2*i) or extract-odd
     (perm[i] == 2*i + 1) patterns; the check is stride-2 modulo the
     two-operand index space.  */
  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      /* xorv (24 bytes = lane xor half-lane) relocates the middle half
	 of the result into the quarter layout described above.  */
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   24152              : 
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement a
   permutation (which is a blend) with and, andnot and or when pshufb is not
   available.

   It handles case:
   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);

   An element[i] must be chosen between op0[i] and op1[i] to satisfy the
   requirement.
 */

static bool
expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
{
  rtx rperm[16], vperm;
  unsigned int i, nelt = d->nelt;

  if (!TARGET_SSE2
      || d->one_operand_p
      || (d->vmode != V16QImode && d->vmode != V8HImode))
    return false;

  /* Element 0 must come from op0[0]; with the loop below this pins
     every slot to one of op0[i] / op1[i], i.e. a pure blend.  */
  if (d->perm[0] != 0)
    return false;

  /* The dest[i] must select an element between op0[i] and op1[i].  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] % nelt) != i)
      return false;

  if (d->testing_p)
     return true;

  /* Generates a blend mask for the operators AND and ANDNOT:
     all-ones selects op0[i], all-zeros selects op1[i].  */
  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
  for (i = 0; i < nelt; i++)
    rperm[i] = (d->perm[i] <  nelt) ? CONSTM1_RTX (inner_mode)
      : CONST0_RTX (inner_mode);

  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
  vperm = force_reg (d->vmode, vperm);

  /* movcc expands to the pand/pandn/por (or blend) sequence.  */
  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);

  return true;
}
   24199              : 
/* Implement permutation with pslldq + psrldq + por when pshufb is not
   available.  The permutation must consist of one contiguous run of
   indices taken from op0 followed by one contiguous run taken from op1;
   op0's run is shifted down to lane 0, op1's run is shifted up to follow
   it, and the two are OR-ed together.  PANDN says whether pand/pandn may
   additionally be emitted to clear the unwanted lanes of each shifted
   operand.  */
static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
{
  unsigned i, nelt = d->nelt;
  /* END1 is the last permutation index belonging to the first run;
     stays (unsigned) -1 until a discontinuity is found.  */
  unsigned start1, end1 = -1;
  machine_mode vmode = d->vmode, imode;
  /* START2 is the first index of the second run, -1 while unknown.  */
  int start2 = -1;
  bool clear_op0, clear_op1;
  unsigned inner_size;
  rtx op0, op1, dop1;
  rtx (*gen_vec_shr) (rtx, rtx, rtx);
  rtx (*gen_vec_shl) (rtx, rtx, rtx);

  /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
  if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
    return false;

  /* Scan for at most one discontinuity in the index sequence; a second
     one means the shape cannot be built from two whole-vector shifts.  */
  start1 = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (d->perm[i] != d->perm[i-1] + 1
	  || d->perm[i] == nelt)
	{
	  if (start2 == -1)
	    {
	      start2 = d->perm[i];
	      end1 = d->perm[i-1];
	    }
	  else
	    return false;
	}
    }

  /* op0 needs its tail cleared unless its run reaches the top lane;
     op1 needs its head cleared unless its run starts at a lane-0
     aligned index.  */
  clear_op0 = end1 != nelt - 1;
  clear_op1 = start2 % nelt != 0;
  /* pandn/pand is needed to clear upper/lower bits of op0/op1.  */
  if (!pandn && (clear_op0 || clear_op1))
    return false;

  if (d->testing_p)
    return true;

  gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
  gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
  imode = GET_MODE_INNER (vmode);
  inner_size = GET_MODE_BITSIZE (imode);
  op0 = gen_reg_rtx (vmode);
  op1 = gen_reg_rtx (vmode);

  /* Shift op0's run down so it starts at lane 0.  */
  if (start1)
    emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
  else
    emit_move_insn (op0, d->op0);

  dop1 = d->op1;
  if (d->one_operand_p)
    dop1 = d->op0;

  /* Shift op1's run up so it lands immediately after op0's run.  */
  int shl_offset = end1 - start1 + 1 - start2 % nelt;
  if (shl_offset)
    emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
  else
    emit_move_insn (op1, dop1);

  /* Clear lower/upper bits for op0/op1.  */
  if (clear_op0 || clear_op1)
    {
      rtx vec[16];
      rtx const_vec;
      rtx clear;
      /* All-ones over the lanes kept from op0, zero elsewhere; the
	 inverse (via NOT) covers op1's lanes.  */
      for (i = 0; i != nelt; i++)
	{
	  if (i < (end1 - start1 + 1))
	    vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
	  else
	    vec[i] = CONST0_RTX (imode);
	}
      const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
      const_vec = validize_mem (force_const_mem (vmode, const_vec));
      clear = force_reg (vmode, const_vec);

      if (clear_op0)
	emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
      if (clear_op1)
	emit_move_insn (op1, gen_rtx_AND (vmode,
					  gen_rtx_NOT (vmode, clear),
					  op1));
    }

  /* Finally merge the two runs with a por.  */
  emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
  return true;
}
   24294              : 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  /* C is the mask keeping the even element of each half_mode pair,
     S the element-bit shift count that extracts the odd element.  */
  unsigned i, odd, c, s, nelt = d->nelt;
  int pblendw_i = 0;
  /* END_PERM: 256-bit AVX2 packs interleave 128-bit lanes, so the pack
     result needs a final cross-lane permute.  */
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  /* Select the wider half_mode view plus the and/pack/shift expanders
     for the vector mode, bailing out when the needed ISA is missing.  */
  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      pblendw_i = 0x5;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      pblendw_i = 0x55;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      pblendw_i = 0x5555;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      /* Use pblendw since const_vector 0 should be cheaper than
	 const_vector 0xffff.  */
      if (d->vmode == V4HImode
	  || d->vmode == E_V8HImode
	  || d->vmode == E_V16HImode)
	{
	  /* Blend each operand with zero to clear the odd elements,
	     then reinterpret as half_mode for the pack below.  */
	  rtx dop0_t = gen_reg_rtx (d->vmode);
	  rtx dop1_t = gen_reg_rtx (d->vmode);
	  t = gen_reg_rtx (d->vmode);
	  emit_move_insn (t, CONST0_RTX (d->vmode));

	  emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
						     GEN_INT (pblendw_i)));
	  emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
						     GEN_INT (pblendw_i)));

	  emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
	  emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
	}
      else
	{
	  /* Mask out the odd elements with an AND against C.  */
	  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
	  t = force_reg (half_mode, t);
	  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
	  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
	}
    }
  else
    {
      /* Odd extraction: shift each half_mode element right by S bits so
	 the odd element moves into the low position.  */
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      /* Reorder the 64-bit chunks 0,2,1,3 to undo the per-lane
	 interleaving of the 256-bit pack.  */
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
   24459              : 
   24460              : /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   24461              :    and extract-odd permutations of two V64QI operands
   24462              :    with two "shifts", two "truncs" and one "concat" insns for "odd"
   24463              :    and two "truncs" and one concat insn for "even."
   24464              :    Have already failed all two instruction sequences.  */
   24465              : 
   24466              : static bool
   24467        23573 : expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
   24468              : {
   24469        23573 :   rtx t1, t2, t3, t4;
   24470        23573 :   unsigned i, odd, nelt = d->nelt;
   24471              : 
   24472        23573 :   if (!TARGET_AVX512BW
   24473          106 :       || d->one_operand_p
   24474           70 :       || d->vmode != V64QImode)
   24475              :     return false;
   24476              : 
   24477              :   /* Check that permutation is even or odd.  */
   24478           70 :   odd = d->perm[0];
   24479           70 :   if (odd > 1)
   24480              :     return false;
   24481              : 
   24482         2422 :   for (i = 1; i < nelt; ++i)
   24483         2388 :     if (d->perm[i] != 2 * i + odd)
   24484              :       return false;
   24485              : 
   24486           34 :   if (d->testing_p)
   24487              :     return true;
   24488              : 
   24489              : 
   24490           34 :   if (odd)
   24491              :     {
   24492            5 :       t1 = gen_reg_rtx (V32HImode);
   24493            5 :       t2 = gen_reg_rtx (V32HImode);
   24494           10 :       emit_insn (gen_lshrv32hi3 (t1,
   24495            5 :                                  gen_lowpart (V32HImode, d->op0),
   24496              :                                  GEN_INT (8)));
   24497           10 :       emit_insn (gen_lshrv32hi3 (t2,
   24498            5 :                                  gen_lowpart (V32HImode, d->op1),
   24499              :                                  GEN_INT (8)));
   24500              :     }
   24501              :   else
   24502              :     {
   24503           29 :       t1 = gen_lowpart (V32HImode, d->op0);
   24504           29 :       t2 = gen_lowpart (V32HImode, d->op1);
   24505              :     }
   24506              : 
   24507           34 :   t3 = gen_reg_rtx (V32QImode);
   24508           34 :   t4 = gen_reg_rtx (V32QImode);
   24509           34 :   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
   24510           34 :   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
   24511           34 :   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
   24512              : 
   24513           34 :   return true;
   24514              : }
   24515              : 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.  ODD is 0 for the even extraction and
   1 for the odd one; the caller has already validated the index
   pattern.  Returns true unless the mode/ISA combination cannot be
   handled (some branches delegate to more specific expanders).  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	/* shufps immediate: 0xdd selects the odd, 0x88 the even
	   elements of each source pair.  */
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry the permutation as V4DF, then move the
	     result back through a lowpart reinterpretation.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry the permutation as V8SF, then move the
	     result back through a lowpart reinterpretation.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
   24776              : 
   24777              : /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   24778              :    extract-even and extract-odd permutations.  */
   24779              : 
   24780              : static bool
   24781        23446 : expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
   24782              : {
   24783        23446 :   unsigned i, odd, nelt = d->nelt;
   24784              : 
   24785        23446 :   odd = d->perm[0];
   24786        23446 :   if (odd != 0 && odd != 1)
   24787              :     return false;
   24788              : 
   24789        63629 :   for (i = 1; i < nelt; ++i)
   24790        56127 :     if (d->perm[i] != 2 * i + odd)
   24791              :       return false;
   24792              : 
   24793         7502 :   if (d->vmode == E_V32HImode
   24794           12 :       && d->testing_p
   24795           12 :       && !TARGET_AVX512BW)
   24796              :     return false;
   24797              : 
   24798         7490 :   return expand_vec_perm_even_odd_1 (d, odd);
   24799              : }
   24800              : 
   24801              : /* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   24802              :    permutations.  We assume that expand_vec_perm_1 has already failed.  */
   24803              : 
static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  /* ELT is the element to broadcast; NELT2 tracks half the current
     element count as the vector is repeatedly widened below.  */
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
         use the vbroadcast instruction.  They expand to two insns
         if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
    case E_V4HImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
        return true;

      /* Interleave with the matching half so the wanted byte lands in
         the low half, then adjust ELT into the new half-size index.  */
      if (elt >= nelt2)
        {
          gen = gen_mmx_punpckhbw_low;
          elt -= nelt2;
        }
      else
        gen = gen_mmx_punpcklbw_low;

      /* One interleave doubles the element width to HImode.  */
      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      /* Finish with a two-element duplicate selection (pshuflw).  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
      /* This can be implemented via interleave.  We save one insn by
         stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
        return true;
      do
        {
          /* Pick the interleave (high or low, byte or word) that keeps
             the wanted element, halving the index space each round.  */
          if (elt >= nelt2)
            {
              gen = vmode == V8QImode ? gen_mmx_punpckhbw
                                      : gen_mmx_punpckhwd;
              elt -= nelt2;
            }
          else
            gen = vmode == V8QImode ? gen_mmx_punpcklbw
                                    : gen_mmx_punpcklwd;
          nelt2 /= 2;

          dest = gen_reg_rtx (vmode);
          emit_insn (gen (dest, op0, op0));
          vmode = get_mode_wider_vector (vmode);
          op0 = gen_lowpart (vmode, dest);
        }
      while (vmode != V2SImode);

      /* Duplicate the surviving SImode element with pshufd.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
         stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
        return true;
      do
        {
          /* Same widening scheme as above, with the SSE interleaves.  */
          if (elt >= nelt2)
            {
              gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
                                       : gen_vec_interleave_highv8hi;
              elt -= nelt2;
            }
          else
            gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
                                     : gen_vec_interleave_lowv8hi;
          nelt2 /= 2;

          dest = gen_reg_rtx (vmode);
          emit_insn (gen (dest, op0, op0));
          vmode = get_mode_wider_vector (vmode);
          op0 = gen_lowpart (vmode, dest);
        }
      while (vmode != V4SImode);

      /* Broadcast the surviving SImode element with pshufd.  */
      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
    case E_V8BFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
        return true;

      /* A single interleave widens HF/BF elements to SImode pairs.  */
      rtx (*gen_interleave) (machine_mode, rtx, rtx, rtx);
      if (elt >= nelt2)
        {
          gen_interleave = gen_vec_interleave_high;
          elt -= nelt2;
        }
      else
        gen_interleave = gen_vec_interleave_low;
      nelt2 /= 2;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen_interleave (vmode, dest, op0, op0));

      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      /* Then duplicate the SImode element holding the wanted value.  */
      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
         vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      /* Likewise: a first-element V64QI broadcast should already have
         been handled by expand_vec_perm_1 under AVX512BW.  */
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      /* With AVX512BW every V32HI broadcast is a single insn and must
         not reach here at all.  */
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
   24978              : 
   24979              : /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   24980              :    broadcast permutations.  */
   24981              : 
   24982              : static bool
   24983        90348 : expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
   24984              : {
   24985        90348 :   unsigned i, elt, nelt = d->nelt;
   24986              : 
   24987        90348 :   if (!d->one_operand_p)
   24988              :     return false;
   24989              : 
   24990         5332 :   elt = d->perm[0];
   24991         8175 :   for (i = 1; i < nelt; ++i)
   24992         8067 :     if (d->perm[i] != elt)
   24993              :       return false;
   24994              : 
   24995          108 :   return expand_vec_perm_broadcast_1 (d);
   24996              : }
   24997              : 
   24998              : /* Implement arbitrary permutations of two V64QImode operands
   24999              :    with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  /* Only handles V64QImode; the V32HImode vperm[it]2w sub-permutations
     and the 512-bit vpshufb below both require AVX512BW.  */
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  /* Build two word-level permutations over the same operands, each of
     which will be realized with a single vperm[it]2w.  */
  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  /* rperm[0..63] is the byte-select mask for ds[0]'s result and
     rperm[64..127] the mask for ds[1]'s; -1 entries zero the byte so
     the two vpshufb results can simply be or'ed together.  */
  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
        {
          rperm[i] = constm1_rtx;
          rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
        }
      else
        {
          rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
          rperm[i + 64] = constm1_rtx;
        }
    }

  /* Word permutations over the full register always succeed with
     AVX512BW's vperm[it]2w.  */
  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  /* Select the even destination bytes from ds[0]'s result...  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  /* ... the odd destination bytes from ds[1]'s result...  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  /* ... and merge the two disjoint halves with one vpor.  */
  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
   25069              : 
   25070              : /* Implement arbitrary permutation of two V32QImode and V16QImode operands
   25071              :    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   25072              :    all the shorter instruction sequences.  */
   25073              : 
static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  /* rperm holds four 32-entry vpshufb masks: [0]/[2] for same-lane
     bytes of op0/op1, [1]/[3] for cross-lane bytes of op0/op1.  */
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      /* Default every byte to "zero the output" (bit 7 set).  */
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      /* E is the source element index within its half-vector, XLANE
         the byte offset if source and destination lanes differ, and
         WHICH selects among the four masks (operand, lane-crossing).  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
        rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  /* Emit the cross-lane vpshufb shuffles (masks 1 and 3), when used.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
        {
          h[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
                                    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X] with vpermq, moving the
     cross-lane bytes into their destination lane.  */
  for (i = 0; i < 2; ++i)
   {
     if (h[i] == NULL_RTX)
       continue;
     op = gen_reg_rtx (V4DImode);
     emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
                                     const2_rtx, GEN_INT (3), const0_rtx,
                                     const1_rtx));
     h[i] = gen_lowpart (V32QImode, op);
   }

  /* Emit the same-lane vpshufb shuffles (masks 0 and 2), when used.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
        {
          l[i] = NULL_RTX;
          continue;
        }
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  /* Combine same-lane and cross-lane contributions per operand; the
     masks are disjoint (zeros elsewhere), so a plain vpor suffices.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
        {
          op = gen_reg_rtx (V32QImode);
          emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
          l[i] = op;
        }
      else if (h[i])
        l[i] = h[i];
    }

  /* Final vpor merges the op0 and op1 contributions, going through a
     V32QImode temporary when the destination mode differs.  */
  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}
   25184              : 
   25185              : /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   25186              :    taken care of, perform the expansion in D and return true on success.  */
   25187              : 
static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The matchers below are ordered by the length of the instruction
     sequence they emit, so the first one that recognizes the
     permutation also yields the cheapest available expansion.  Each
     returns true on success (and, unless D->testing_p, has emitted
     the code), false if the pattern does not apply.  */

  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, true))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;

  if (expand_vec_perm_shufps_shufps (d))
    return true;

  if (expand_vec_perm_punpckldq_pshuf (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_pslldq_psrldq_por (d, false))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* The two-insn variants of these were tried above; the false
     argument now allows the three-insn forms.  */
  if (expand_vec_perm_2perm_interleave (d, false))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  if (expand_vec_perm_psrlw_psllw_por (d))
    return true;

  if (expand_vec_perm_pand_pandn_por (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Generate four or five instructions.  */
  if (expand_vec_perm_pslldq_psrldq_por (d, true))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
        emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
   25312              : 
   25313              : /* If a permutation only uses one operand, make it clear. Returns true
   25314              :    if the permutation references both operands.  */
   25315              : 
   25316              : static bool
   25317        74642 : canonicalize_perm (struct expand_vec_perm_d *d)
   25318              : {
   25319        74642 :   int i, which, nelt = d->nelt;
   25320              : 
   25321       449508 :   for (i = which = 0; i < nelt; ++i)
   25322       509485 :     which |= (d->perm[i] < nelt ? 1 : 2);
   25323              : 
   25324        74642 :   d->one_operand_p = true;
   25325        74642 :   switch (which)
   25326              :     {
   25327            0 :     default:
   25328            0 :       gcc_unreachable();
   25329              : 
   25330        55521 :     case 3:
   25331        55521 :       if (!rtx_equal_p (d->op0, d->op1))
   25332              :         {
   25333        55470 :           d->one_operand_p = false;
   25334        55470 :           break;
   25335              :         }
   25336              :       /* The elements of PERM do not suggest that only the first operand
   25337              :          is used, but both operands are identical.  Allow easier matching
   25338              :          of the permutation by folding the permutation into the single
   25339              :          input vector.  */
   25340              :       /* FALLTHRU */
   25341              : 
   25342              :     case 2:
   25343         2913 :       for (i = 0; i < nelt; ++i)
   25344         2576 :         d->perm[i] &= nelt - 1;
   25345          337 :       d->op0 = d->op1;
   25346          337 :       break;
   25347              : 
   25348        18835 :     case 1:
   25349        18835 :       d->op1 = d->op0;
   25350        18835 :       break;
   25351              :     }
   25352              : 
   25353        74642 :   return (which == 3);
   25354              : }
   25355              : 
   25356              : /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
                      : /* VMODE is the mode of the result and OP_MODE the mode of the two
                      :    input operands; only the VMODE == OP_MODE case is handled here.
                      :    TARGET is NULL when the caller merely asks whether the constant
                      :    permutation SEL is supported at all — in that case no insns may
                      :    be emitted and only a yes/no answer is returned (d.testing_p).  */
   25357              : 
   25358              : bool
   25359       830158 : ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   25360              :                                rtx target, rtx op0, rtx op1,
   25361              :                                const vec_perm_indices &sel)
   25362              : {
   25363       830158 :   if (vmode != op_mode)
   25364              :     return false;
   25365              : 
   25366       828293 :   struct expand_vec_perm_d d;
   25367       828293 :   unsigned char perm[MAX_VECT_LEN];
   25368       828293 :   unsigned int i, nelt, which;
   25369       828293 :   bool two_args;
   25370              : 
   25371              :   /* For HF and BF mode vector, convert it to HI using subreg.  */
                      :   /* Permutations only move bits around, so a same-size integer
                      :      element mode is equivalent and avoids needing FP16/BF16
                      :      patterns.  */
   25372      2484421 :   if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
   25373              :     {
   25374          484 :       machine_mode orig_mode = vmode;
   25375          968 :       vmode = mode_for_vector (HImode,
   25376          484 :                                GET_MODE_NUNITS (vmode)).require ();
   25377          484 :       if (target)
   25378          441 :         target = lowpart_subreg (vmode, target, orig_mode);
   25379          484 :       if (op0)
   25380          441 :         op0 = lowpart_subreg (vmode, op0, orig_mode);
   25381          484 :       if (op1)
   25382          441 :         op1 = lowpart_subreg (vmode, op1, orig_mode);
   25383              :     }
   25384              : 
   25385       828293 :   d.target = target;
   25386       828293 :   d.op0 = op0;
   25387       828293 :   d.op1 = op1;
   25388              : 
   25389       828293 :   d.vmode = vmode;
   25390       828293 :   gcc_assert (VECTOR_MODE_P (d.vmode));
   25391       828293 :   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   25392       828293 :   d.testing_p = !target;
   25393              : 
   25394       828293 :   gcc_assert (sel.length () == nelt);
   25395       828293 :   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
   25396              : 
   25397              :   /* Given sufficient ISA support we can just return true here
   25398              :      for selected vector modes.  */
   25399       828293 :   switch (d.vmode)
   25400              :     {
   25401         2047 :     case E_V16SFmode:
   25402         2047 :     case E_V16SImode:
   25403         2047 :     case E_V8DImode:
   25404         2047 :     case E_V8DFmode:
   25405         2047 :       if (!TARGET_AVX512F)
   25406              :         return false;
   25407              :       /* All implementable with a single vperm[it]2 insn.  */
   25408         2047 :       if (d.testing_p)
   25409              :         return true;
   25410              :       break;
   25411          323 :     case E_V32HImode:
   25412          323 :       if (!TARGET_AVX512F)
   25413              :         return false;
   25414          323 :       if (d.testing_p && TARGET_AVX512BW)
   25415              :         /* All implementable with a single vperm[it]2 insn.  */
   25416              :         return true;
   25417              :       break;
   25418          752 :     case E_V64QImode:
   25419          752 :       if (!TARGET_AVX512F)
   25420              :         return false;
   25421          752 :       if (d.testing_p && TARGET_AVX512BW)
   25422              :         /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
   25423              :         return true;
   25424              :       break;
   25425        12888 :     case E_V8SImode:
   25426        12888 :     case E_V8SFmode:
   25427        12888 :     case E_V4DFmode:
   25428        12888 :     case E_V4DImode:
   25429        12888 :       if (!TARGET_AVX)
   25430              :         return false;
   25431        12888 :       if (d.testing_p && TARGET_AVX512VL)
   25432              :         /* All implementable with a single vperm[it]2 insn.  */
   25433              :         return true;
   25434              :       break;
   25435          614 :     case E_V16HImode:
   25436          614 :       if (!TARGET_SSE2)
   25437              :         return false;
   25438          614 :       if (d.testing_p && TARGET_AVX2)
   25439              :         /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
   25440              :         return true;
   25441              :       break;
   25442          696 :     case E_V32QImode:
   25443          696 :       if (!TARGET_SSE2)
   25444              :         return false;
   25445          696 :       if (d.testing_p && TARGET_AVX2)
   25446              :         /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
   25447              :         return true;
   25448              :       break;
   25449        37702 :     case E_V8HImode:
   25450        37702 :     case E_V16QImode:
   25451        37702 :       if (!TARGET_SSE2)
   25452              :         return false;
   25453              :       /* Fall through.  */
   25454       238413 :     case E_V4SImode:
   25455       238413 :     case E_V4SFmode:
   25456       238413 :       if (!TARGET_SSE)
   25457              :         return false;
   25458              :       /* All implementable with a single vpperm insn.  */
   25459       238413 :       if (d.testing_p && TARGET_XOP)
   25460              :         return true;
   25461              :       /* All implementable with 2 pshufb + 1 ior.  */
   25462       238307 :       if (d.testing_p && TARGET_SSSE3)
   25463              :         return true;
   25464              :       break;
   25465       138561 :     case E_V2SFmode:
   25466       138561 :     case E_V2SImode:
   25467       138561 :     case E_V4HImode:
   25468       138561 :     case E_V8QImode:
   25469       138561 :       if (!TARGET_MMX_WITH_SSE)
   25470              :         return false;
   25471              :       break;
   25472        25533 :     case E_V2HImode:
   25473        25533 :       if (!TARGET_SSE2)
   25474              :         return false;
   25475              :       /* All implementable with *punpckwd.  */
   25476        25533 :       if (d.testing_p)
   25477              :         return true;
   25478              :       break;
   25479        10804 :     case E_V4QImode:
   25480        10804 :       if (!TARGET_SSE2)
   25481              :         return false;
   25482              :       break;
   25483       395748 :     case E_V2DImode:
   25484       395748 :     case E_V2DFmode:
   25485       395748 :       if (!TARGET_SSE)
   25486              :         return false;
   25487              :       /* All implementable with shufpd or unpck[lh]pd.  */
   25488       395748 :       if (d.testing_p)
   25489              :         return true;
   25490              :       break;
   25491              :     default:
   25492              :       return false;
   25493              :     }
   25494              : 
                      :   /* WHICH is a bitmask: bit 0 set if any selector element reads OP0,
                      :      bit 1 set if any reads OP1 (indices >= nelt).  */
   25495      2233858 :   for (i = which = 0; i < nelt; ++i)
   25496              :     {
   25497      1826984 :       unsigned char e = sel[i];
   25498      1826984 :       gcc_assert (e < 2 * nelt);
   25499      1826984 :       d.perm[i] = e;
   25500      1826984 :       perm[i] = e;
   25501      2476040 :       which |= (e < nelt ? 1 : 2);
   25502              :     }
   25503              : 
   25504       406874 :   if (d.testing_p)
   25505              :     {
   25506              :       /* For all elements from second vector, fold the elements to first.  */
   25507       333880 :       if (which == 2)
   25508         1345 :         for (i = 0; i < nelt; ++i)
   25509         1240 :           d.perm[i] -= nelt;
   25510              : 
   25511              :       /* Check whether the mask can be applied to the vector type.  */
   25512       333880 :       d.one_operand_p = (which != 3);
   25513              : 
   25514              :       /* Implementable with shufps, pshufd or pshuflw.  */
   25515       333880 :       if (d.one_operand_p
   25516              :           && (d.vmode == V4SFmode || d.vmode == V2SFmode
   25517              :               || d.vmode == V4SImode || d.vmode == V2SImode
   25518              :               || d.vmode == V4HImode || d.vmode == V2HImode))
   25519              :         return true;
   25520              : 
   25521              :       /* Otherwise we have to go through the motions and see if we can
   25522              :          figure out how to generate the requested permutation.  */
                      :       /* Raw pseudos past LAST_VIRTUAL_REGISTER stand in for the real
                      :          operands; the expansion runs inside a temporary sequence whose
                      :          insns are never emitted into the main stream.  */
   25523       230890 :       d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
   25524       230890 :       d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
   25525       230890 :       if (!d.one_operand_p)
   25526       216989 :         d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
   25527              : 
   25528       230890 :       start_sequence ();
   25529       230890 :       bool ret = ix86_expand_vec_perm_const_1 (&d);
   25530       230890 :       end_sequence ();
   25531              : 
   25532       230890 :       return ret;
   25533              :     }
   25534              : 
   25535        72994 :   two_args = canonicalize_perm (&d);
   25536              : 
   25537              :   /* If one of the operands is a zero vector, try to match pmovzx.  */
   25538        72994 :   if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
   25539              :     {
   25540          573 :       struct expand_vec_perm_d dzero = d;
   25541          573 :       if (d.op0 == CONST0_RTX (vmode))
   25542              :         {
                      :           /* Swap so the zero vector is always DZERO.op1, flipping the
                      :              operand bit of every selector index to match.  */
   25543          387 :           d.op1 = dzero.op1 = force_reg (vmode, d.op1);
   25544          387 :           std::swap (dzero.op0, dzero.op1);
   25545         7527 :           for (i = 0; i < nelt; ++i)
   25546         7140 :             dzero.perm[i] ^= nelt;
   25547              :         }
   25548              :       else
   25549          186 :         d.op0 = dzero.op0 = force_reg (vmode, d.op0);
   25550              : 
   25551          573 :       if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
   25552          573 :                                   dzero.perm, nelt, dzero.testing_p))
   25553          116 :         return true;
   25554              :     }
   25555              : 
   25556              :   /* Force operands into registers.  */
   25557        72878 :   rtx nop0 = force_reg (vmode, d.op0);
   25558        72878 :   if (d.op0 == d.op1)
   25559        18675 :     d.op1 = nop0;
   25560        72878 :   d.op0 = nop0;
   25561        72878 :   d.op1 = force_reg (vmode, d.op1);
   25562              : 
   25563        72878 :   if (ix86_expand_vec_perm_const_1 (&d))
   25564              :     return true;
   25565              : 
   25566              :   /* If the selector says both arguments are needed, but the operands are the
   25567              :      same, the above tried to expand with one_operand_p and flattened selector.
   25568              :      If that didn't work, retry without one_operand_p; we succeeded with that
   25569              :      during testing.  */
   25570           22 :   if (two_args && d.one_operand_p)
   25571              :     {
                      :       /* PERM still holds the original, unflattened selector saved above.  */
   25572           22 :       d.one_operand_p = false;
   25573           22 :       memcpy (d.perm, perm, sizeof (perm));
   25574           22 :       return ix86_expand_vec_perm_const_1 (&d);
   25575              :     }
   25576              : 
   25577              :   return false;
   25578              : }
   25579              : 
                      : /* Expand a permutation storing into TARG the even-numbered (ODD == 0)
                      :    or odd-numbered (ODD == 1) elements of the concatenation of OP0 and
                      :    OP1, i.e. element I of TARG is element 2*I + ODD of OP0:OP1.  */
   25580              : void
   25581         8190 : ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   25582              : {
   25583         8190 :   struct expand_vec_perm_d d;
   25584         8190 :   unsigned i, nelt;
   25585              : 
   25586         8190 :   d.target = targ;
   25587         8190 :   d.op0 = op0;
   25588         8190 :   d.op1 = op1;
   25589         8190 :   d.vmode = GET_MODE (targ);
   25590         8190 :   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   25591         8190 :   d.one_operand_p = false;
   25592         8190 :   d.testing_p = false;
   25593              : 
   25594        77926 :   for (i = 0; i < nelt; ++i)
   25595        69736 :     d.perm[i] = i * 2 + odd;
   25596              : 
   25597              :   /* We'll either be able to implement the permutation directly...  */
   25598         8190 :   if (expand_vec_perm_1 (&d))
   25599         3176 :     return;
   25600              : 
   25601              :   /* ... or we use the special-case patterns.  */
   25602         5014 :   expand_vec_perm_even_odd_1 (&d, odd);
   25603              : }
   25604              : 
                      : /* Expand an interleave (punpck-style) permutation: TARG receives
                      :    alternating elements of OP0 and OP1, taken from their low halves
                      :    when HIGH_P is false and from their high halves when true.  */
   25605              : static void
   25606          924 : ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   25607              : {
   25608          924 :   struct expand_vec_perm_d d;
   25609          924 :   unsigned i, nelt, base;
   25610          924 :   bool ok;
   25611              : 
   25612          924 :   d.target = targ;
   25613          924 :   d.op0 = op0;
   25614          924 :   d.op1 = op1;
   25615          924 :   d.vmode = GET_MODE (targ);
   25616          924 :   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   25617          924 :   d.one_operand_p = false;
   25618          924 :   d.testing_p = false;
   25619              : 
                      :   /* Pairs (i + base, i + base + nelt) select matching elements from
                      :      OP0 and OP1; BASE picks which half of each operand is read.  */
   25620          924 :   base = high_p ? nelt / 2 : 0;
   25621         3652 :   for (i = 0; i < nelt / 2; ++i)
   25622              :     {
   25623         2728 :       d.perm[i * 2] = i + base;
   25624         2728 :       d.perm[i * 2 + 1] = i + base + nelt;
   25625              :     }
   25626              : 
   25627              :   /* Note that for AVX this isn't one instruction.  */
   25628          924 :   ok = ix86_expand_vec_perm_const_1 (&d);
   25629          924 :   gcc_assert (ok);
   25630              : }
   25631              : 
   25632              : /* Expand a vector operation shift by constant for a V*QImode in terms of the
   25633              :    same operation on V*HImode. Return true if success. */
                      : /* CODE is ASHIFT, ASHIFTRT or LSHIFTRT; DEST/OP1 are V16QI, V32QI or
                      :    V64QI vectors and OP2 the shift count.  Returns false (emitting
                      :    nothing) when OP2 is not a CONST_INT or is >= 8.  */
   25634              : static bool
   25635          380 : ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
   25636              :                                      rtx dest, rtx op1, rtx op2)
   25637              : {
   25638          380 :   machine_mode qimode, himode;
   25639          380 :   HOST_WIDE_INT and_constant, xor_constant;
   25640          380 :   HOST_WIDE_INT shift_amount;
   25641          380 :   rtx vec_const_and, vec_const_xor;
   25642          380 :   rtx tmp, op1_subreg;
   25643          380 :   rtx (*gen_shift) (rtx, rtx, rtx);
   25644          380 :   rtx (*gen_and) (rtx, rtx, rtx);
   25645          380 :   rtx (*gen_xor) (rtx, rtx, rtx);
   25646          380 :   rtx (*gen_sub) (rtx, rtx, rtx);
   25647              : 
   25648              :   /* Only optimize shift by constant.  */
   25649          380 :   if (!CONST_INT_P (op2))
   25650              :     return false;
   25651              : 
   25652          380 :   qimode = GET_MODE (dest);
   25653          380 :   shift_amount = INTVAL (op2);
   25654              :   /* Do nothing when shift amount greater equal 8.  */
   25655          380 :   if (shift_amount > 7)
   25656              :     return false;
   25657              : 
   25658          380 :   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
   25659              : 
   25660              : 
                      :   /* Arithmetic right shift by 7 just broadcasts each byte's sign bit,
                      :      so emit a (0 > op1) byte compare (all-ones for negative bytes,
                      :      zero otherwise) instead of a real shift.  */
   25661          380 :   if (shift_amount == 7
   25662          380 :       && code == ASHIFTRT)
   25663              :     {
   25664           32 :       if (qimode == V16QImode
   25665           10 :           || qimode == V32QImode)
   25666              :         {
   25667           31 :           rtx zero = gen_reg_rtx (qimode);
   25668           31 :           emit_move_insn (zero, CONST0_RTX (qimode));
   25669           31 :           emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
   25670           31 :         }
   25671              :       else
   25672              :         {
                      :           /* V64QI byte compares produce a k-mask; round-trip through
                      :              vpmovb2m / vpmovm2b to get the -1/0 byte vector.  */
   25673            1 :           gcc_assert (qimode == V64QImode);
   25674            1 :           rtx kmask = gen_reg_rtx (DImode);
   25675            1 :           emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
   25676            1 :           emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
   25677              :         }
   25678           32 :       return true;
   25679              :     }
   25680              : 
   25681              :   /* Record sign bit.  */
   25682          348 :   xor_constant = 1 << (8 - shift_amount - 1);
   25683              : 
   25684              :   /* Zero upper/lower bits shift from left/right element.  */
   25685          348 :   and_constant
   25686          348 :     = (code == ASHIFT ? 256 - (1 << shift_amount)
   25687          317 :        : (1 << (8 - shift_amount)) - 1);
   25688              : 
                      :   /* Pick the HImode shift and the QImode and/xor/sub patterns matching
                      :      the vector width.  */
   25689          348 :   switch (qimode)
   25690              :     {
   25691          331 :     case V16QImode:
   25692          331 :       himode = V8HImode;
   25693          281 :       gen_shift =
   25694              :         ((code == ASHIFT)
   25695          331 :          ? gen_ashlv8hi3
   25696          313 :          : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
   25697              :       gen_and = gen_andv16qi3;
   25698              :       gen_xor = gen_xorv16qi3;
   25699              :       gen_sub = gen_subv16qi3;
   25700              :       break;
   25701            6 :     case V32QImode:
   25702            6 :       himode = V16HImode;
   25703            1 :       gen_shift =
   25704              :         ((code == ASHIFT)
   25705            6 :          ? gen_ashlv16hi3
   25706            2 :          : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
   25707              :       gen_and = gen_andv32qi3;
   25708              :       gen_xor = gen_xorv32qi3;
   25709              :       gen_sub = gen_subv32qi3;
   25710              :       break;
   25711           11 :     case V64QImode:
   25712           11 :       himode = V32HImode;
   25713            1 :       gen_shift =
   25714              :         ((code == ASHIFT)
   25715           11 :          ? gen_ashlv32hi3
   25716            2 :          : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
   25717              :       gen_and = gen_andv64qi3;
   25718              :       gen_xor = gen_xorv64qi3;
   25719              :       gen_sub = gen_subv64qi3;
   25720              :       break;
   25721            0 :     default:
   25722            0 :       gcc_unreachable ();
   25723              :     }
   25724              : 
   25725          348 :   tmp = gen_reg_rtx (himode);
   25726          348 :   vec_const_and = gen_reg_rtx (qimode);
   25727          348 :   op1_subreg = lowpart_subreg (himode, op1, qimode);
   25728              : 
   25729              :   /* For ASHIFT and LSHIFTRT, perform operation like
   25730              :      vpsllw/vpsrlw $shift_amount, %op1, %dest.
   25731              :      vpand %vec_const_and, %dest.  */
   25732          348 :   emit_insn (gen_shift (tmp, op1_subreg, op2));
   25733          348 :   emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
   25734          348 :   emit_move_insn (vec_const_and,
   25735              :                   ix86_build_const_vector (qimode, true,
   25736          348 :                                            gen_int_mode (and_constant, QImode)));
   25737          348 :   emit_insn (gen_and (dest, dest, vec_const_and));
   25738              : 
   25739              :   /* For ASHIFTRT, perform extra operation like
   25740              :      vpxor %vec_const_xor, %dest, %dest
   25741              :      vpsubb %vec_const_xor, %dest, %dest  */
                      :   /* XOR with the shifted-in sign-bit position followed by SUB
                      :      sign-extends the (8 - shift_amount)-bit field to a full byte.  */
   25742          348 :   if (code == ASHIFTRT)
   25743              :     {
   25744           34 :       vec_const_xor = gen_reg_rtx (qimode);
   25745           34 :       emit_move_insn (vec_const_xor,
   25746              :                       ix86_build_const_vector (qimode, true,
   25747           34 :                                                gen_int_mode (xor_constant, QImode)));
   25748           34 :       emit_insn (gen_xor (dest, dest, vec_const_xor));
   25749           34 :       emit_insn (gen_sub (dest, dest, vec_const_xor));
   25750              :     }
   25751              :   return true;
   25752              : }
   25753              : 
                      : /* Expand a vector operation CODE (MULT or a shift) for a partial
                      :    QImode vector DEST (V4QI or V8QI only, see the switch below) in
                      :    terms of the same operation on V8HImode, working inside a full
                      :    V16QImode register.  OP2 may be a vector operand, a CONST_INT
                      :    shift count, or another scalar.  */
   25754              : void
   25755         1412 : ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   25756              : {
   25757         1412 :   machine_mode qimode = GET_MODE (dest);
   25758         1412 :   rtx qop1, qop2, hop1, hop2, qdest, hdest;
   25759         1412 :   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   25760         1412 :   bool uns_p = code != ASHIFTRT;
   25761              : 
   25762         1412 :   switch (qimode)
   25763              :     {
   25764         1412 :     case E_V4QImode:
   25765         1412 :     case E_V8QImode:
   25766         1412 :       break;
   25767            0 :     default:
   25768            0 :       gcc_unreachable ();
   25769              :     }
   25770              : 
                      :   /* Widen the partial vectors to full V16QI registers; the upper
                      :      elements are don't-care.  */
   25771         1412 :   qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
   25772              : 
   25773         1412 :   if (op2vec)
   25774         1310 :     qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
   25775              :   else
   25776              :     qop2 = op2;
   25777              : 
   25778         1412 :   qdest = gen_reg_rtx (V16QImode);
   25779              : 
   25780         1412 :   if (CONST_INT_P (op2)
   25781           90 :       && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
   25782              :       /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
   25783              :          Even with SSE4.1 the alternative is better.  */
   25784           90 :       && !TARGET_SSE4_1
   25785         1466 :       && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
   25786              :     {
   25787           54 :       emit_move_insn (dest, gen_lowpart (qimode, qdest));
   25788           54 :       return;
   25789              :     }
   25790              : 
                      :   /* Arithmetic right shift by 7 is just a sign-bit broadcast; a
                      :      (0 > op1) byte compare produces exactly that.  */
   25791         1358 :   if (CONST_INT_P (op2)
   25792           36 :       && code == ASHIFTRT
   25793           10 :       && INTVAL (op2) == 7)
   25794              :     {
   25795            3 :       rtx zero = gen_reg_rtx (qimode);
   25796            3 :       emit_move_insn (zero, CONST0_RTX (qimode));
   25797            3 :       emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
   25798            3 :       return;
   25799              :     }
   25800              : 
   25801         1355 :   switch (code)
   25802              :     {
   25803         1297 :     case MULT:
   25804         1297 :       gcc_assert (op2vec);
   25805         1297 :       if (!TARGET_SSE4_1)
   25806              :         {
   25807              :           /* Unpack data such that we've got a source byte in each low byte
   25808              :              of each word.  We don't care what goes into the high byte of
   25809              :              each word.  Rather than trying to get zero in there, most
   25810              :              convenient is to let it be a copy of the low byte.  */
   25811          244 :           hop1 = copy_to_reg (qop1);
   25812          244 :           hop2 = copy_to_reg (qop2);
   25813          244 :           emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
   25814          244 :           emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
   25815          244 :           break;
   25816              :         }
   25817              :       /* FALLTHRU */
   25818         1111 :     case ASHIFT:
   25819         1111 :     case ASHIFTRT:
   25820         1111 :     case LSHIFTRT:
                      :       /* Sign- or zero-extend (per UNS_P) the bytes into words.  */
   25821         1111 :       hop1 = gen_reg_rtx (V8HImode);
   25822         1111 :       ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
   25823              :       /* mult/vashr/vlshr/vashl  */
   25824         1111 :       if (op2vec)
   25825              :         {
   25826         1066 :           hop2 = gen_reg_rtx (V8HImode);
   25827         1066 :           ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
   25828              :         }
   25829              :       else
   25830              :         hop2 = qop2;
   25831              : 
   25832              :       break;
   25833            0 :     default:
   25834            0 :       gcc_unreachable ();
   25835              :     }
   25836              : 
   25837         1355 :   if (code != MULT && op2vec)
   25838              :     {
   25839              :       /* Expand vashr/vlshr/vashl.  */
   25840           13 :       hdest = gen_reg_rtx (V8HImode);
   25841           13 :       emit_insn (gen_rtx_SET (hdest,
   25842              :                               simplify_gen_binary (code, V8HImode,
   25843              :                                                    hop1, hop2)));
   25844              :     }
   25845              :   else
   25846              :     /* Expand mult/ashr/lshr/ashl.  */
   25847         1342 :     hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
   25848              :                                 NULL_RTX, 1, OPTAB_DIRECT);
   25849              : 
                      :   /* Narrow the word results back to bytes: vpmovwb when available,
                      :      otherwise an even-byte-gathering permutation.  */
   25850         1355 :   if (TARGET_AVX512BW && TARGET_AVX512VL)
   25851              :     {
   25852           30 :       if (qimode == V8QImode)
   25853              :         qdest = dest;
   25854              :       else
   25855           10 :         qdest = gen_reg_rtx (V8QImode);
   25856              : 
   25857           30 :       emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
   25858              :     }
   25859              :   else
   25860              :     {
   25861         1325 :       struct expand_vec_perm_d d;
   25862         1325 :       rtx qres = gen_lowpart (V16QImode, hdest);
   25863         1325 :       bool ok;
   25864         1325 :       int i;
   25865              : 
   25866              :       /* Merge the data back into the right place.  */
   25867         1325 :       d.target = qdest;
   25868         1325 :       d.op0 = d.op1 = qres;
   25869         1325 :       d.vmode = V16QImode;
   25870         1325 :       d.nelt = 16;
   25871         1325 :       d.one_operand_p = TARGET_SSSE3;
   25872         1325 :       d.testing_p = false;
   25873              : 
   25874        22525 :       for (i = 0; i < d.nelt; ++i)
   25875        21200 :         d.perm[i] = i * 2;
   25876              : 
   25877         1325 :       ok = ix86_expand_vec_perm_const_1 (&d);
   25878         1325 :       gcc_assert (ok);
   25879              :     }
   25880              : 
   25881         1355 :   if (qdest != dest)
   25882         1335 :     emit_move_insn (dest, gen_lowpart (qimode, qdest));
   25883              : }
   25884              : 
   25885              : /* Emit instruction in 2x wider mode.  For example, optimize
   25886              : 
   25887              :    vector MUL generation like
   25888              : 
   25889              :    vpmovzxbw ymm2, xmm0
   25890              :    vpmovzxbw ymm3, xmm1
   25891              :    vpmullw   ymm4, ymm2, ymm3
   25892              :    vpmovwb   xmm0, ymm4
   25893              : 
   25894              :    it would take less instructions than ix86_expand_vecop_qihi.
   25895              :    Return true if success.  */
                      : /* CODE is MULT or a shift; DEST/OP1 are QImode vectors and OP2 a
                      :    vector or scalar second operand.  Returns false, emitting nothing,
                      :    when the target lacks VPMOVWB (AVX512BW, plus AVX512VL for the
                      :    V16QI case), for V64QI (no V64HI mode), or when the preferred
                      :    vector width forbids the needed wider registers.  */
   25896              : static bool
   25897         1339 : ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   25898              : {
   25899         1339 :   machine_mode himode, qimode = GET_MODE (dest);
   25900         1339 :   machine_mode wqimode;
   25901         1339 :   rtx qop1, qop2, hop1, hop2, hdest;
   25902         1339 :   rtx (*gen_truncate)(rtx, rtx) = NULL;
   25903         1339 :   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   25904         1339 :   bool uns_p = code != ASHIFTRT;
   25905              : 
   25906              :   /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
   25907              :      generic permutation to merge the data back into the right place.  This
   25908              :      permutation results in VPERMQ, which is slow, so better fall back to
   25909              :      ix86_expand_vecop_qihi.  */
   25910         1339 :   if (!TARGET_AVX512BW
   25911          301 :       || (qimode == V16QImode && !TARGET_AVX512VL)
   25912              :       /* There are no V64HImode instructions.  */
   25913          301 :       || qimode == V64QImode)
   25914              :      return false;
   25915              : 
   25916              :   /* Do not generate ymm/zmm instructions when
   25917              :      target prefers 128/256 bit vector width.  */
   25918          267 :   if ((qimode == V16QImode && TARGET_PREFER_AVX128)
   25919          267 :       || (qimode == V32QImode && TARGET_PREFER_AVX256))
   25920              :     return false;
   25921              : 
   25922          260 :   switch (qimode)
   25923              :     {
   25924              :     case E_V16QImode:
   25925              :       himode = V16HImode;
   25926              :       gen_truncate = gen_truncv16hiv16qi2;
   25927              :       break;
   25928           17 :     case E_V32QImode:
   25929           17 :       himode = V32HImode;
   25930           17 :       gen_truncate = gen_truncv32hiv32qi2;
   25931           17 :       break;
   25932            0 :     default:
   25933            0 :       gcc_unreachable ();
   25934              :     }
   25935              : 
                      :   /* View the operands as double-width QI vectors (upper half
                      :      don't-care) so the whole input fits one unpack source.  */
   25936          260 :   wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
   25937          260 :   qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
   25938              : 
   25939          260 :   if (op2vec)
   25940          260 :     qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
   25941              :   else
   25942              :     qop2 = op2;
   25943              : 
                      :   /* Sign- or zero-extend (per UNS_P) the bytes to words.  */
   25944          260 :   hop1 = gen_reg_rtx (himode);
   25945          260 :   ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
   25946              : 
   25947          260 :   if (op2vec)
   25948              :     {
   25949          260 :       hop2 = gen_reg_rtx (himode);
   25950          260 :       ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
   25951              :     }
   25952              :   else
   25953              :     hop2 = qop2;
   25954              : 
   25955          260 :   if (code != MULT && op2vec)
   25956              :     {
   25957              :       /* Expand vashr/vlshr/vashl.  */
   25958           14 :       hdest = gen_reg_rtx (himode);
   25959           14 :       emit_insn (gen_rtx_SET (hdest,
   25960              :                               simplify_gen_binary (code, himode,
   25961              :                                                    hop1, hop2)));
   25962              :     }
   25963              :   else
   25964              :     /* Expand mult/ashr/lshr/ashl.  */
   25965          246 :     hdest = expand_simple_binop (himode, code, hop1, hop2,
   25966              :                                  NULL_RTX, 1, OPTAB_DIRECT);
   25967              : 
                      :   /* Narrow the word results back to bytes with vpmovwb.  */
   25968          260 :   emit_insn (gen_truncate (dest, hdest));
   25969          260 :   return true;
   25970              : }
   25971              : 
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  DEST is the V*QImode result; OP1 and
   OP2 are the operands (OP2 may be a scalar shift count).  The byte
   operands are widened into two HImode half vectors, the operation
   is performed there, and the byte results are merged back with a
   constant permutation.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  /* True when OP2 is itself an integer vector (per-element counts or
     multiplicands) rather than a scalar shift count.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  struct expand_vec_perm_d d;
  bool full_interleave = true;
  /* Only arithmetic right shift needs sign-extending unpacks.  */
  bool uns_p = code != ASHIFTRT;
  bool ok;
  int i;

  /* A shift by a constant count may be expandable directly in QImode;
     try that first.  */
  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  /* Next try the alternate expansion helper; if it handles this case
     we are done.  */
  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  /* Otherwise split each operand into two HImode halves below.  */
  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      break;
    case E_V32QImode:
      himode = V16HImode;
      break;
    case E_V64QImode:
      himode = V32HImode;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      switch (qimode)
	{
	case E_V16QImode:
	  gen_il = gen_vec_interleave_lowv16qi;
	  gen_ih = gen_vec_interleave_highv16qi;
	  break;
	case E_V32QImode:
	  gen_il = gen_avx2_interleave_lowv32qi;
	  gen_ih = gen_avx2_interleave_highv32qi;
	  /* The AVX2/AVX512BW interleaves work within 128-bit lanes, so
	     the final merge permutation differs; see below.  */
	  full_interleave = false;
	  break;
	case E_V64QImode:
	  gen_il = gen_avx512bw_interleave_lowv64qi;
	  gen_ih = gen_avx512bw_interleave_highv64qi;
	  full_interleave = false;
	  break;
	default:
	  gcc_unreachable ();
	}

      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      break;

    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      /* For shifts, properly extend (zero or sign, per UNS_P) OP1 into
	 the two HImode halves.  */
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (op2vec)
	{
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	/* A scalar count applies unchanged to both halves.  */
	op2_l = op2_h = op2;

      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  else
    {
      /* Expand mult/ashr/lshr/ashl.  */
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* We used the full interleave, the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
   26135              : 
   26136              : /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   26137              :    if op is CONST_VECTOR with all odd elements equal to their
   26138              :    preceding element.  */
   26139              : 
   26140              : static bool
   26141         8756 : const_vector_equal_evenodd_p (rtx op)
   26142              : {
   26143         8756 :   machine_mode mode = GET_MODE (op);
   26144         8756 :   int i, nunits = GET_MODE_NUNITS (mode);
   26145         8756 :   if (!CONST_VECTOR_P (op)
   26146         8756 :       || nunits != CONST_VECTOR_NUNITS (op))
   26147              :     return false;
   26148         3574 :   for (i = 0; i < nunits; i += 2)
   26149         2882 :     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
   26150              :       return false;
   26151              :   return true;
   26152              : }
   26153              : 
/* Expand a widening multiply into DEST, multiplying the even
   (ODD_P false) or odd (ODD_P true) SImode elements of OP1 and OP2;
   each product occupies a double-width element of DEST.  UNS_P
   selects unsigned versus signed multiplication.  */

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  /* Remember the original operands; constant vectors whose odd
     elements duplicate the even ones need no shifting below.  */
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      /* Shift each operand right by one element width, viewed in the
	 wide mode, unless its odd elements already equal the evens.  */
      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
   26248              : 
/* Expand a widening multiply into DEST of the high (HIGH_P true) or
   low (HIGH_P false) half of the elements of OP1 and OP2.  UNS_P
   selects unsigned versus signed multiplication.  The element mode
   of OP1/OP2 determines the strategy; DEST has double-width
   elements.  */

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  /* After interleaving the wanted half sits in the even slots,
	     so the even/odd expander is called with odd_p false.  */
	  high_p = false;
	}
      /* HIGH_P doubles as ODD_P here: on the XOP path the wanted
	 elements were shuffled into the odd positions.  */
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      /* HImode has both low- and high-part multiply insns; compute
	 both halves of each product and interleave them.  */
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V64QImode:
      /* Widen the selected halves of both operands and multiply in
	 the wide mode directly.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
   26338              : 
/* Expand the V4SImode multiply OP0 = OP1 * OP2 using only SSE2
   even/odd widening multiplies, for targets without a direct
   32-bit element multiply.  */

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  /* Compute the low 32 bits of the even and odd products; only the
     low half of each DImode product is wanted.  */
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  /* res_1 is reused to hold the emitted insn so a REG_EQUAL note can
     be attached to the final interleave.  */
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
   26373              : 
/* Expand the V2DI/V4DI/V8DI multiply OP0 = OP1 * OP2.  Uses a direct
   64-bit element multiply where AVX512DQ (+VL) provides one, the XOP
   horizontal-add sequence for V2DImode, and otherwise a three-multiply
   schoolbook decomposition built from even widening multiplies.  */

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
					gen_lowpart (V4SImode, op1),
					gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      /* Generic fallback: express the 64x64->64 multiply through
	 unsigned 32x32->64 even multiplies on the matching SImode
	 vector mode.  */
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();


      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  /* Attach a REG_EQUAL note so later passes can see the multiply.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
   26470              : 
   26471              : /* Return 1 if control tansfer instruction INSN
   26472              :    should be encoded with notrack prefix.  */
   26473              : 
   26474              : bool
   26475     14857979 : ix86_notrack_prefixed_insn_p (rtx_insn *insn)
   26476              : {
   26477     14857979 :   if (!insn || !((flag_cf_protection & CF_BRANCH)))
   26478              :     return false;
   26479              : 
   26480      3983300 :   if (CALL_P (insn))
   26481              :     {
   26482      1376770 :       rtx call = get_call_rtx_from (insn);
   26483      1376770 :       gcc_assert (call != NULL_RTX);
   26484      1376770 :       rtx addr = XEXP (call, 0);
   26485              : 
   26486              :       /* Do not emit 'notrack' if it's not an indirect call.  */
   26487      1376770 :       if (MEM_P (addr)
   26488      1376770 :           && SYMBOL_REF_P (XEXP (addr, 0)))
   26489              :         return false;
   26490              :       else
   26491        68936 :         return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
   26492              :     }
   26493              : 
   26494      2606530 :   if (JUMP_P (insn) && !flag_cet_switch)
   26495              :     {
   26496      2593159 :       rtx target = JUMP_LABEL (insn);
   26497      2593159 :       if (target == NULL_RTX || ANY_RETURN_P (target))
   26498              :         return false;
   26499              : 
   26500              :       /* Check the jump is a switch table.  */
   26501      2593121 :       rtx_insn *label = as_a<rtx_insn *> (target);
   26502      2593121 :       rtx_insn *table = next_insn (label);
   26503      2593121 :       if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
   26504              :         return false;
   26505              :       else
   26506              :         return true;
   26507              :     }
   26508              :   return false;
   26509              : }
   26510              : 
   26511              : /* Calculate integer abs() using only SSE2 instructions.  */
   26512              : 
   26513              : void
   26514          557 : ix86_expand_sse2_abs (rtx target, rtx input)
   26515              : {
   26516          557 :   machine_mode mode = GET_MODE (target);
   26517          557 :   rtx tmp0, tmp1, x;
   26518              : 
   26519          557 :   switch (mode)
   26520              :     {
   26521           24 :     case E_V2DImode:
   26522           24 :     case E_V4DImode:
   26523              :       /* For 64-bit signed integer X, with SSE4.2 use
   26524              :          pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
   26525              :          Otherwise handle it similarly to V4SImode, except use 64 as W instead of
   26526              :          32 and use logical instead of arithmetic right shift (which is
   26527              :          unimplemented) and subtract.  */
   26528           24 :       if (TARGET_SSE4_2)
   26529              :         {
   26530            9 :           tmp0 = gen_reg_rtx (mode);
   26531            9 :           tmp1 = gen_reg_rtx (mode);
   26532            9 :           emit_move_insn (tmp1, CONST0_RTX (mode));
   26533            9 :           if (mode == E_V2DImode)
   26534            6 :             emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
   26535              :           else
   26536            3 :             emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
   26537              :         }
   26538              :       else
   26539              :         {
   26540           30 :           tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
   26541           15 :                                       GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
   26542              :                                                - 1), NULL, 0, OPTAB_DIRECT);
   26543           15 :           tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
   26544              :         }
   26545              : 
   26546           24 :       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
   26547              :                                   NULL, 0, OPTAB_DIRECT);
   26548           24 :       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
   26549              :                                target, 0, OPTAB_DIRECT);
   26550           24 :       break;
   26551              : 
   26552           49 :     case E_V4SImode:
   26553              :       /* For 32-bit signed integer X, the best way to calculate the absolute
   26554              :          value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
   26555           49 :       tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
   26556           49 :                                   GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
   26557              :                                   NULL, 0, OPTAB_DIRECT);
   26558           49 :       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
   26559              :                                   NULL, 0, OPTAB_DIRECT);
   26560           49 :       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
   26561              :                                target, 0, OPTAB_DIRECT);
   26562           49 :       break;
   26563              : 
   26564           85 :     case E_V8HImode:
   26565              :       /* For 16-bit signed integer X, the best way to calculate the absolute
   26566              :          value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
   26567           85 :       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
   26568              : 
   26569           85 :       x = expand_simple_binop (mode, SMAX, tmp0, input,
   26570              :                                target, 0, OPTAB_DIRECT);
   26571           85 :       break;
   26572              : 
   26573          399 :     case E_V16QImode:
   26574              :       /* For 8-bit signed integer X, the best way to calculate the absolute
   26575              :          value of X is min ((unsigned char) X, (unsigned char) (-X)),
   26576              :          as SSE2 provides the PMINUB insn.  */
   26577          399 :       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
   26578              : 
   26579          399 :       x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
   26580              :                                target, 0, OPTAB_DIRECT);
   26581          399 :       break;
   26582              : 
   26583            0 :     default:
   26584            0 :       gcc_unreachable ();
   26585              :     }
   26586              : 
   26587          557 :   if (x != target)
   26588            0 :     emit_move_insn (target, x);
   26589          557 : }
   26590              : 
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  /* operands[2] is the width of the extracted field in bits,
     operands[3] its bit position within SRC.  */
  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
        return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      /* Fold the byte offset of a source subreg into the bit position
         and operate on the inner register.  */
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx d, pat;

        /* The extracted field must have a scalar integer mode of its
           own width.  */
        if (!int_mode_for_size (size, 0).exists (&dstmode))
          return false;

        /* Pick the vector mode whose element width matches the field
           and check the ISA provides the corresponding pextr variant
           (pextrw is SSE2; pextrb/pextrd/pextrq need SSE4.1).  */
        switch (dstmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V16QImode;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            srcmode = V8HImode;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V4SImode;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V2DImode;
            break;

          default:
            return false;
          }

        /* Reject extractions from misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* Construct insn pattern: a VEC_SELECT of element POS/SIZE.  */
        pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
        pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

        /* Let the rtl optimizers know about the zero extension performed.  */
        if (dstmode == QImode || dstmode == HImode)
          {
            pat = gen_rtx_ZERO_EXTEND (SImode, pat);
            d = gen_lowpart (SImode, d);
          }

        emit_insn (gen_rtx_SET (d, pat));

        /* If we extracted into a temporary, copy the result back.  */
        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
   26693              : 
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  /* operands[1] is the width of the inserted field in bits,
     operands[2] its bit position within DST.  */
  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      /* Fold the byte offset of a destination subreg into the bit
         position and operate on the inner register.  */
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx (*pinsr)(rtx, rtx, rtx, rtx);
        rtx d;

        /* The inserted field must have a scalar integer mode of its
           own width.  */
        if (!int_mode_for_size (size, 0).exists (&srcmode))
          return false;

        /* Pick the vector mode whose element width matches the field
           and the pinsr generator the ISA provides for it (pinsrw is
           SSE2; pinsrb/pinsrd/pinsrq need SSE4.1).  */
        switch (srcmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V16QImode;
            pinsr = gen_sse4_1_pinsrb;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            dstmode = V8HImode;
            pinsr = gen_sse2_pinsrw;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V4SImode;
            pinsr = gen_sse4_1_pinsrd;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V2DImode;
            pinsr = gen_sse4_1_pinsrq;
            break;

          default:
            return false;
          }

        /* Reject insertions to misaligned positions.  */
        if (pos & (size-1))
          return false;

        if (SUBREG_P (src))
          {
            unsigned int srcpos = SUBREG_BYTE (src);

            if (srcpos > 0)
              {
                /* Non-lowpart source subreg: extract the field with
                   pextr first, then insert the extracted value.  */
                rtx extr_ops[4];

                extr_ops[0] = gen_reg_rtx (srcmode);
                extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
                extr_ops[2] = GEN_INT (size);
                extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

                if (!ix86_expand_pextr (extr_ops))
                  return false;

                src = extr_ops[0];
              }
            else
              src = gen_lowpart (srcmode, SUBREG_REG (src));
          }

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* Last operand is the element selector, encoded as
           1 << (element index).  */
        emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
                          gen_lowpart (srcmode, src),
                          GEN_INT (1 << (pos / size))));
        /* If we inserted into a temporary, copy the result back.  */
        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}
   26805              : 
   26806              : /* All CPUs prefer to avoid cross-lane operations so perform reductions
   26807              :    upper against lower halves up to SSE reg size.  */
   26808              : 
   26809              : machine_mode
   26810         1892 : ix86_split_reduction (machine_mode mode)
   26811              : {
   26812              :   /* Reduce lowpart against highpart until we reach SSE reg width to
   26813              :      avoid cross-lane operations.  */
   26814         1892 :   switch (mode)
   26815              :     {
   26816              :     case E_V8DImode:
   26817              :     case E_V4DImode:
   26818              :       return V2DImode;
   26819            9 :     case E_V16SImode:
   26820            9 :     case E_V8SImode:
   26821            9 :       return V4SImode;
   26822            8 :     case E_V32HImode:
   26823            8 :     case E_V16HImode:
   26824            8 :       return V8HImode;
   26825            4 :     case E_V64QImode:
   26826            4 :     case E_V32QImode:
   26827            4 :       return V16QImode;
   26828            5 :     case E_V16SFmode:
   26829            5 :     case E_V8SFmode:
   26830            5 :       return V4SFmode;
   26831           16 :     case E_V8DFmode:
   26832           16 :     case E_V4DFmode:
   26833           16 :       return V2DFmode;
   26834         1845 :     default:
   26835         1845 :       return mode;
   26836              :     }
   26837              : }
   26838              : 
   26839              : /* Generate call to __divmoddi4.  */
   26840              : 
   26841              : void
   26842          897 : ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
   26843              :                             rtx op0, rtx op1,
   26844              :                             rtx *quot_p, rtx *rem_p)
   26845              : {
   26846         1794 :   rtx rem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   26847              : 
   26848          897 :   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
   26849              :                                       mode, op0, mode, op1, mode,
   26850          897 :                                       XEXP (rem, 0), Pmode);
   26851          897 :   *quot_p = quot;
   26852          897 :   *rem_p = rem;
   26853          897 : }
   26854              : 
/* Expand an atomic fetch_op/op_fetch operation on MEM with operand VAL
   via a compare-and-swap loop.  CODE is the operation; NOT selects the
   NAND form, i.e. ~(MEM & VAL).  AFTER chooses whether TARGET receives
   the value after the operation (op_fetch) or before it (fetch_op).
   DOUBLEWORD is passed through to ix86_expand_cmpxchg_loop.  */

void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
                                  enum rtx_code code, bool after,
                                  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  /* Read the current memory value once before the loop; on cmpxchg
     failure ix86_expand_cmpxchg_loop refreshes OLD_MEM before jumping
     back to LOOP_LABEL.  */
  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  /* Compute the new value; NOT means NAND: ~(OLD & VAL).  */
  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
                                     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
                                   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  /* SUCCESS is left NULL so the cmpxchg loop allocates its own
     success flag.  */
  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
                            gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
                                          SImode),
                            doubleword, loop_label);
}
   26896              : 
/* Relax the cmpxchg instruction.  LOOP_LABEL selects the strategy:
   when non-NULL, the cmpxchg is retried with a pause loop that jumps
   back to LOOP_LABEL until it succeeds; when NULL, the cmpxchg is
   guarded by an atomic load + compare and skipped entirely when
   *MEM != EXP_INPUT.  On return *PTARGET_BOOL holds the QImode
   success flag (allocated here if the caller passed NULL) and
   TARGET_VAL the value observed in memory.  DOUBLEWORD selects the
   double-word compare-and-swap patterns operating on half-mode
   parts.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
                          rtx mem, rtx exp_input, rtx new_input,
                          rtx mem_model, bool doubleword,
                          rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  /* Select the compare-and-swap generator for MODE; TImode and
     doubleword DImode use the double-word patterns with HMODE set to
     the half-width mode.  */
  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
        {
          gendw = gen_atomic_compare_and_swapdi_doubleword;
          hmode = SImode;
        }
      else
        gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      /* Compare the two halves separately; either mismatch jumps to
         CMP_LABEL.  */
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
                             GET_MODE (exp_input), 1, cmp_label,
                             profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
                      gen_lowpart (hmode, new_input),
                      gen_highpart (hmode, new_input),
                      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
  {
    /* No retry loop: on mismatch, skip the cmpxchg and report the
       loaded value; the success flag reflects the ZF left by either
       the compare or the cmpxchg.  */
    emit_jump_insn (gen_jump (done_label));
    emit_barrier ();
    emit_label (cmp_label);
    emit_move_insn (target_val, new_mem);
    emit_label (done_label);
    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                       const0_rtx);
  }
  else
  {
    /* Retry loop: on cmpxchg failure, branch back to LOOP_LABEL.  */
    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                       const0_rtx);
    emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
                             GET_MODE (target_bool), 1, loop_label,
                             profile_probability::guessed_never ());
    emit_jump_insn (gen_jump (done_label));
    emit_barrier ();

    /* If mem is not expected, pause and loop back.  */
    emit_label (cmp_label);
    emit_move_insn (target_val, new_mem);
    emit_insn (gen_pause ());
    emit_jump_insn (gen_jump (loop_label));
    emit_barrier ();
    emit_label (done_label);
  }

  *ptarget_bool = target_bool;
}
   27014              : 
/* Convert a BFmode VAL to SFmode without signaling sNaNs.
   This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16.  */

rtx
ix86_expand_fast_convert_bf_to_sf (rtx val)
{
  rtx op = gen_lowpart (HImode, val), ret;
  if (CONST_INT_P (op))
    {
      /* Constant input: try to fold the extension at compile time.  */
      ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
                                            val, BFmode);
      if (ret)
        return ret;
      /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
      /* Place the 16 BF bits into the high half of an SImode register
         and reinterpret it as SFmode.  */
      ret = gen_reg_rtx (SImode);
      emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
      emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
      return gen_lowpart (SFmode, ret);
    }

  /* Non-constant input: use the extendbfsf2_1 pattern.  */
  ret = gen_reg_rtx (SFmode);
  emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
  return ret;
}
   27039              : 
/* Expand the first compare of a conditional-compare (ccmp) sequence
   for APX CCMP.  On success, store the insns preparing the operands
   in *PREP_SEQ and the comparison itself in *GEN_SEQ and return the
   comparison rtx; return NULL_RTX when the compare cannot be handled
   by ccmp.  */

rtx
ix86_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
                        rtx_code code, tree treeop0, tree treeop1)
{
  if (!TARGET_APX_CCMP)
    return NULL_RTX;

  rtx op0, op1, res;
  machine_mode op_mode;

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  /* We only support the following scalar comparisons that use just 1
     instruction: DI/SI/QI/HI/DF/SF/HF.
     Unordered/Ordered compare cannot be correctly identified by
     ccmp so they are not supported.  */
  if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
        || op_mode == QImode || op_mode == DFmode || op_mode == SFmode
        || op_mode == HFmode)
      || code == ORDERED
      || code == UNORDERED)
    {
      end_sequence ();
      return NULL_RTX;
    }

  /* Canonicalize the operands according to mode.  */
  if (SCALAR_INT_MODE_P (op_mode))
    {
      if (!nonimmediate_operand (op0, op_mode))
        op0 = force_reg (op_mode, op0);
      if (!x86_64_general_operand (op1, op_mode))
        op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* op0/op1 can be canonicalized from expand_fp_compare, so
         just adjust the code to make it generate supported fp
         condition.  */
      if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
        {
          /* First try to split condition if we don't need to honor
             NaNs, as the ORDERED/UNORDERED check always fall
             through.  */
          if (!HONOR_NANS (op_mode))
            {
              rtx_code first_code;
              split_comparison (code, op_mode, &first_code, &code);
            }
          /* Otherwise try to swap the operand order and check if
             the comparison is supported.  */
          else
            {
              code = swap_condition (code);
              std::swap (op0, op1);
            }

          if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
            {
              end_sequence ();
              return NULL_RTX;
            }
        }
    }

  *prep_seq = end_sequence ();

  start_sequence ();

  res = ix86_expand_compare (code, op0, op1);

  if (!res)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = end_sequence ();

  return res;
}
   27125              : 
/* Expand a subsequent compare of a conditional-compare (ccmp)
   sequence for APX CCMP.  PREV is the comparison rtx produced by the
   previous step; BIT_CODE is how this compare combines with it
   (AND/IOR).  Insns are appended to *PREP_SEQ/*GEN_SEQ.  Returns the
   combined comparison rtx, or NULL_RTX when unsupported.  */

rtx
ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
                       rtx_code cmp_code, tree treeop0, tree treeop1,
                       rtx_code bit_code)
{
  if (!TARGET_APX_CCMP)
    return NULL_RTX;

  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  rtx_code prev_code;
  struct expand_operand ops[5];
  int dfv;

  /* Exit early for non-integer modes to avoid O(n^2) part of expand_operands. */
  cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0));

  if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
        || op_mode == QImode))
    return NULL_RTX;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  icode = code_for_ccmp (op_mode);

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }

  *prep_seq = end_sequence ();

  target = gen_rtx_REG (cc_mode, FLAGS_REG);
  /* NOTE(review): dfv is presumably the APX "default flags value"
     immediate of the ccmp pattern — see ix86_get_flags_cc.  */
  dfv = ix86_get_flags_cc ((rtx_code) cmp_code);

  prev_code = GET_CODE (prev);
  /* Fixup FP compare code here.  */
  if (GET_MODE (XEXP (prev, 0)) == CCFPmode)
    prev_code = ix86_fp_compare_code_to_integer (prev_code);

  /* For an IOR combination reverse the guarding condition; for AND,
     flip the low bit of the default flags value instead.  */
  if (bit_code != AND)
    prev_code = reverse_condition (prev_code);
  else
    dfv = (int)(dfv ^ 1);

  prev = gen_rtx_fmt_ee (prev_code, VOIDmode, XEXP (prev, 0),
                         const0_rtx);

  /* Operands of the ccmp pattern: flags target, guarding condition,
     the two compare operands, and the default flags immediate.  */
  create_fixed_operand (&ops[0], target);
  create_fixed_operand (&ops[1], prev);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], GEN_INT (dfv));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 5, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
   27197              : 
   27198              : /* Attempt to convert a CONST_VECTOR into a bcst_mem_operand.
   27199              :    Returns NULL_RTX if X is cannot be expressed as a suitable
   27200              :    VEC_DUPLICATE in mode MODE.  */
   27201              : 
   27202              : static rtx
   27203           48 : ix86_gen_bcst_mem (machine_mode mode, rtx x)
   27204              : {
   27205           48 :   if (!TARGET_AVX512F
   27206           48 :       || !CONST_VECTOR_P (x)
   27207           64 :       || (!TARGET_AVX512VL && GET_MODE_SIZE (mode) != 64)
   27208          147 :       || !VALID_BCST_MODE_P (GET_MODE_INNER (mode))
   27209              :          /* Disallow HFmode broadcast.  */
   27210          126 :       || GET_MODE_SIZE (GET_MODE_INNER (mode)) < 4)
   27211              :     return NULL_RTX;
   27212              : 
   27213           21 :   rtx cst = CONST_VECTOR_ELT (x, 0);
   27214           21 :   if (!CONST_SCALAR_INT_P (cst)
   27215           15 :       && !CONST_DOUBLE_P (cst)
   27216            0 :       && !CONST_FIXED_P (cst))
   27217              :     return NULL_RTX;
   27218              : 
   27219           21 :   int n_elts = GET_MODE_NUNITS (mode);
   27220           42 :   if (CONST_VECTOR_NUNITS (x) != n_elts)
   27221              :     return NULL_RTX;
   27222              : 
   27223          150 :   for (int i = 1; i < n_elts; i++)
   27224          129 :     if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
   27225              :       return NULL_RTX;
   27226              : 
   27227           42 :   rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
   27228           21 :   return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
   27229              : }
   27230              : 
/* Determine the ternlog immediate index that implements 3-operand
   ternary logic expression OP.  This uses and modifies the 3 element
   array ARGS to record and check the leaves, either 3 REGs, or 2 REGs
   and MEM.  Returns an index between 0 and 255 for a valid ternlog,
   or -1 if the expression isn't suitable.

   The index is the 8-bit truth table of the expression over the three
   leaves: each leaf contributes its "projection" (args[0] -> 0xf0,
   args[1] -> 0xcc, args[2] -> 0xaa) and logical operations combine
   these bitwise, so e.g. (a & ~b) yields 0xf0 & ~0xcc = 0x30.  */

int
ix86_ternlog_idx (rtx op, rtx *args)
{
  int idx0, idx1;

  if (!op)
    return -1;

  switch (GET_CODE (op))
    {
    case SUBREG:
      /* A SUBREG leaf is only acceptable if it is a register.  */
      if (!register_operand (op, GET_MODE (op)))
        return -1;
      /* FALLTHRU */

    case REG:
      /* Record OP in the first free ARGS slot, or reuse the slot of an
	 identical leaf already seen; return that slot's projection.  */
      if (!args[0])
        {
          args[0] = op;
          return 0xf0;
        }
      if (rtx_equal_p (op, args[0]))
        return 0xf0;
      if (!args[1])
        {
          args[1] = op;
          return 0xcc;
        }
      if (rtx_equal_p (op, args[1]))
        return 0xcc;
      if (!args[2])
        {
          args[2] = op;
          return 0xaa;
        }
      if (rtx_equal_p (op, args[2]))
        return 0xaa;
      /* More than three distinct leaves: not a ternlog.  */
      return -1;

    case VEC_DUPLICATE:
      /* An embedded-broadcast memory operand counts as a memory leaf.  */
      if (!bcst_mem_operand (op, GET_MODE (op)))
        return -1;
      goto do_mem_operand;

    case MEM:
      if (!memory_operand (op, GET_MODE (op)))
        return -1;
      /* Reject volatile memory unless volatile operands are currently
	 permitted (volatile_ok).  */
      if (MEM_P (op)
          && MEM_VOLATILE_P (op)
          && !volatile_ok)
        return -1;
      /* FALLTHRU */

    case CONST_VECTOR:
do_mem_operand:
      /* Memory-like leaves prefer the args[2] slot (0xaa), since the
	 ternlog instruction takes its memory operand there.  */
      if (!args[2])
        {
          args[2] = op;
          return 0xaa;
        }
      /* Maximum of one volatile memory reference per expression.  */
      if (side_effects_p (op))
        return -1;
      if (rtx_equal_p (op, args[2]))
        return 0xaa;
      /* Check if CONST_VECTOR is the ones-complement of args[2].  */
      if (CONST_VECTOR_P (op)
          && CONST_VECTOR_P (args[2])
          && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
                                                          op, GET_MODE (op)),
                          args[2]))
        return 0x55;
      if (!args[0])
        {
          args[0] = op;
          return 0xf0;
        }
      if (rtx_equal_p (op, args[0]))
        return 0xf0;
      /* Check if CONST_VECTOR is the ones-complement of args[0].  */
      if (CONST_VECTOR_P (op)
          && CONST_VECTOR_P (args[0])
          && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
                                                          op, GET_MODE (op)),
                          args[0]))
        return 0x0f;
      if (!args[1])
        {
          args[1] = op;
          return 0xcc;
        }
      if (rtx_equal_p (op, args[1]))
        return 0xcc;
      /* Check if CONST_VECTOR is the ones-complement of args[1].  */
      if (CONST_VECTOR_P (op)
          && CONST_VECTOR_P (args[1])
          && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
                                                          op, GET_MODE (op)),
                          args[1]))
        return 0x33;
      return -1;

    case NOT:
      /* Logical NOT complements the truth table.  */
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      return (idx0 >= 0) ? idx0 ^ 0xff : -1;

    case AND:
      /* AND/IOR/XOR of subexpressions combine their truth tables
	 bitwise with the corresponding operation.  */
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
        return -1;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 & idx1 : -1;

    case IOR:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
        return -1;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 | idx1 : -1;

    case XOR:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
        return -1;
      /* XOR with all-ones is a complement; don't consume a leaf slot
	 for the constant.  */
      if (vector_all_ones_operand (XEXP (op, 1), GET_MODE (op)))
        return idx0 ^ 0xff;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 ^ idx1 : -1;

    case UNSPEC:
      /* A nested UNSPEC_VTERNLOG supplies its own immediate, provided
	 its operands occupy the canonical (unpermuted) slots.  */
      if (XINT (op, 1) != UNSPEC_VTERNLOG
          || XVECLEN (op, 0) != 4
          || !CONST_INT_P (XVECEXP (op, 0, 3)))
        return -1;

      /* TODO: Handle permuted operands.  */
      if (ix86_ternlog_idx (XVECEXP (op, 0, 0), args) != 0xf0
          || ix86_ternlog_idx (XVECEXP (op, 0, 1), args) != 0xcc
          || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
        return -1;
      return INTVAL (XVECEXP (op, 0, 3));

    default:
      return -1;
    }
}
   27383              : 
   27384              : /* Return TRUE if OP (in mode MODE) is the leaf of a ternary logic
   27385              :    expression, such as a register or a memory reference.  */
   27386              : 
   27387              : bool
   27388      3308975 : ix86_ternlog_leaf_p (rtx op, machine_mode mode)
   27389              : {
   27390              :   /* We can't use memory_operand here, as it may return a different
   27391              :      value before and after reload (for volatile MEMs) which creates
   27392              :      problems splitting instructions.  */
   27393      3308975 :   return register_operand (op, mode)
   27394       723565 :          || MEM_P (op)
   27395       374707 :          || CONST_VECTOR_P (op)
   27396      3582958 :          || bcst_mem_operand (op, mode);
   27397              : }
   27398              : 
   27399              : /* Test whether OP is a 3-operand ternary logic expression suitable
   27400              :    for use in a ternlog instruction.  */
   27401              : 
   27402              : bool
   27403      2196426 : ix86_ternlog_operand_p (rtx op)
   27404              : {
   27405      2196426 :   rtx op0, op1;
   27406      2196426 :   rtx args[3];
   27407              : 
   27408      2196426 :   args[0] = NULL_RTX;
   27409      2196426 :   args[1] = NULL_RTX;
   27410      2196426 :   args[2] = NULL_RTX;
   27411      2196426 :   int idx = ix86_ternlog_idx (op, args);
   27412      2196426 :   if (idx < 0)
   27413              :     return false;
   27414              : 
   27415              :   /* Don't match simple (binary or unary) expressions.  */
   27416      1785438 :   machine_mode mode = GET_MODE (op);
   27417      1785438 :   switch (GET_CODE (op))
   27418              :     {
   27419       819686 :     case AND:
   27420       819686 :       op0 = XEXP (op, 0);
   27421       819686 :       op1 = XEXP (op, 1);
   27422              : 
   27423              :       /* Prefer pand.  */
   27424       819686 :       if (ix86_ternlog_leaf_p (op0, mode)
   27425       819686 :           && ix86_ternlog_leaf_p (op1, mode))
   27426              :         return false;
   27427              :       /* Prefer pandn.  */
   27428       103796 :       if (GET_CODE (op0) == NOT
   27429        73091 :           && register_operand (XEXP (op0, 0), mode)
   27430       173296 :           && ix86_ternlog_leaf_p (op1, mode))
   27431              :         return false;
   27432              :       break;
   27433              : 
   27434       618722 :     case IOR:
   27435              :       /* Prefer por.  */
   27436       618722 :       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
   27437       618722 :           && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
   27438              :         return false;
   27439              :       break;
   27440              : 
   27441       317481 :     case XOR:
   27442       317481 :       op1 = XEXP (op, 1);
   27443              :       /* Prefer pxor, or one_cmpl<vmode>2.  */
   27444       317481 :       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
   27445       317481 :           && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
   27446              :         return false;
   27447              :       break;
   27448              : 
   27449              :     default:
   27450              :       break;
   27451              :     }
   27452              :   return true;
   27453              : }
   27454              : 
   27455              : /* Helper function for ix86_expand_ternlog.  */
   27456              : static rtx
   27457            0 : ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode,
   27458              :                            rtx op0, rtx op1, rtx target)
   27459              : {
   27460            0 :   if (GET_MODE (op0) != mode)
   27461            0 :     op0 = gen_lowpart (mode, op0);
   27462            0 :   if (GET_MODE (op1) != mode)
   27463            0 :     op1 = gen_lowpart (mode, op1);
   27464              : 
   27465            0 :   if (CONST_VECTOR_P (op0))
   27466            0 :     op0 = validize_mem (force_const_mem (mode, op0));
   27467            0 :   if (CONST_VECTOR_P (op1))
   27468            0 :     op1 = validize_mem (force_const_mem (mode, op1));
   27469              : 
   27470            0 :   if (!register_operand (op0, mode))
   27471              :     {
   27472            0 :       if (!register_operand (op1, mode))
   27473              :         {
   27474              :           /* We can't use force_reg (op0, mode).  */
   27475            0 :           rtx reg = gen_reg_rtx (mode);
   27476            0 :           emit_move_insn (reg, op0);
   27477            0 :           op0 = reg;
   27478              :         }
   27479              :       else
   27480              :         std::swap (op0, op1);
   27481              :     }
   27482            0 :   rtx ops[3] = { target, op0, op1 };
   27483            0 :   ix86_expand_vector_logical_operator (code, mode, ops);
   27484            0 :   return target;
   27485              : }
   27486              : 
   27487              : 
   27488              : /* Helper function for ix86_expand_ternlog.  */
   27489              : static rtx
   27490            0 : ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target)
   27491              : {
   27492            0 :   if (GET_MODE (op0) != mode)
   27493            0 :     op0 = gen_lowpart (mode, op0);
   27494            0 :   op0 = gen_rtx_NOT (mode, op0);
   27495            0 :   if (GET_MODE (op1) != mode)
   27496            0 :     op1 = gen_lowpart (mode, op1);
   27497            0 :   if (CONST_VECTOR_P (op1))
   27498            0 :     op1 = validize_mem (force_const_mem (mode, op1));
   27499            0 :   emit_move_insn (target, gen_rtx_AND (mode, op0, op1));
   27500            0 :   return target;
   27501              : }
   27502              : 
   27503              : /* Expand a 3-operand ternary logic expression.  Return TARGET. */
   27504              : rtx
   27505         2354 : ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
   27506              :                      rtx target)
   27507              : {
   27508         2354 :   rtx tmp0, tmp1, tmp2;
   27509              : 
   27510         2354 :   if (!target)
   27511            3 :     target = gen_reg_rtx (mode);
   27512              : 
   27513              :   /* Canonicalize ternlog index for degenerate (duplicated) operands.  */
   27514         2354 :   if (rtx_equal_p (op0, op1) && rtx_equal_p (op0, op2))
   27515            0 :     switch (idx & 0x81)
   27516              :       {
   27517              :       case 0x00:
   27518              :         idx = 0x00;
   27519              :         break;
   27520              :       case 0x01:
   27521              :         idx = 0x0f;
   27522              :         break;
   27523              :       case 0x80:
   27524              :         idx = 0xf0;
   27525              :         break;
   27526              :       case 0x81:
   27527              :         idx = 0xff;
   27528              :         break;
   27529              :       }
   27530              : 
   27531         2354 :   switch (idx & 0xff)
   27532              :     {
   27533            0 :     case 0x00:
   27534            0 :       if ((!op0 || !side_effects_p (op0))
   27535            0 :           && (!op1 || !side_effects_p (op1))
   27536            0 :           && (!op2 || !side_effects_p (op2)))
   27537              :         {
   27538            0 :           emit_move_insn (target, CONST0_RTX (mode));
   27539            0 :           return target;
   27540              :         }
   27541              :       break;
   27542              : 
   27543            0 :     case 0x0a: /* ~a&c */
   27544            0 :       if ((!op1 || !side_effects_p (op1))
   27545            0 :           && op0 && register_operand (op0, mode)
   27546            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27547            0 :         return ix86_expand_ternlog_andnot (mode, op0, op2, target);
   27548              :       break;
   27549              : 
   27550            0 :     case 0x0c: /* ~a&b */
   27551            0 :       if ((!op2 || !side_effects_p (op2))
   27552            0 :           && op0 && register_operand (op0, mode)
   27553            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode))
   27554            0 :         return ix86_expand_ternlog_andnot (mode, op0, op1, target);
   27555              :       break;
   27556              : 
   27557           14 :     case 0x0f:  /* ~a */
   27558            0 :       if ((!op1 || !side_effects_p (op1))
   27559           14 :           && (!op2 || !side_effects_p (op2))
   27560           28 :           && op0)
   27561              :         {
   27562           14 :           emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
   27563           14 :           return target;
   27564              :         }
   27565              :       break;
   27566              : 
   27567            0 :     case 0x22: /* ~b&c */
   27568            0 :       if ((!op0 || !side_effects_p (op0))
   27569            0 :           && op1 && register_operand (op1, mode)
   27570            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27571            0 :         return ix86_expand_ternlog_andnot (mode, op1, op2, target);
   27572              :       break;
   27573              : 
   27574            0 :     case 0x30: /* ~b&a */
   27575            0 :       if ((!op2 || !side_effects_p (op2))
   27576            0 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27577            0 :           && op1 && register_operand (op1, mode))
   27578            0 :         return ix86_expand_ternlog_andnot (mode, op1, op0, target);
   27579              :       break;
   27580              : 
   27581            0 :     case 0x33:  /* ~b */
   27582            0 :       if ((!op0 || !side_effects_p (op0))
   27583            0 :           && (!op2 || !side_effects_p (op2))
   27584            0 :           && op1)
   27585              :         {
   27586            0 :           emit_move_insn (target, gen_rtx_XOR (mode, op1, CONSTM1_RTX (mode)));
   27587            0 :           return target;
   27588              :         }
   27589              :       break;
   27590              : 
   27591            0 :     case 0x3c:  /* a^b */
   27592            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27593            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27594            0 :           && (!op2 || !side_effects_p (op2)))
   27595            0 :         return ix86_expand_ternlog_binop (XOR, mode, op0, op1, target);
   27596              :       break;
   27597              : 
   27598            0 :     case 0x44: /* ~c&b */
   27599            0 :       if ((!op0 || !side_effects_p (op0))
   27600            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27601            0 :           && op2 && register_operand (op2, mode))
   27602            0 :         return ix86_expand_ternlog_andnot (mode, op2, op1, target);
   27603              :       break;
   27604              : 
   27605            2 :     case 0x50: /* ~c&a */
   27606            0 :       if ((!op1 || !side_effects_p (op1))
   27607            2 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27608            4 :           && op2 && register_operand (op2, mode))
   27609            0 :         return ix86_expand_ternlog_andnot (mode, op2, op0, target);
   27610              :       break;
   27611              : 
   27612            4 :     case 0x55:  /* ~c */
   27613            1 :       if ((!op0 || !side_effects_p (op0))
   27614            4 :           && (!op1 || !side_effects_p (op1))
   27615            8 :           && op2)
   27616              :         {
   27617            4 :           emit_move_insn (target, gen_rtx_XOR (mode, op2, CONSTM1_RTX (mode)));
   27618            4 :           return target;
   27619              :         }
   27620              :       break;
   27621              : 
   27622            0 :     case 0x5a:  /* a^c */
   27623            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27624            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode)
   27625            0 :           && (!op1 || !side_effects_p (op1)))
   27626            0 :         return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
   27627              :       break;
   27628              : 
   27629            0 :     case 0x66:  /* b^c */
   27630            0 :       if ((!op0 || !side_effects_p (op0))
   27631            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27632            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27633            0 :         return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
   27634              :       break;
   27635              : 
   27636            0 :     case 0x88:  /* b&c */
   27637            0 :       if ((!op0 || !side_effects_p (op0))
   27638            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27639            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27640            0 :         return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
   27641              :       break;
   27642              : 
   27643            0 :     case 0xa0:  /* a&c */
   27644            0 :       if ((!op1 || !side_effects_p (op1))
   27645            0 :           && op0 && ix86_ternlog_leaf_p (op0, mode)
   27646            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27647            0 :         return ix86_expand_ternlog_binop (AND, mode, op0, op2, target);
   27648              :       break;
   27649              : 
   27650            0 :     case 0xaa:  /* c */
   27651            0 :       if ((!op0 || !side_effects_p (op0))
   27652            0 :           && (!op1 || !side_effects_p (op1))
   27653            0 :           && op2)
   27654              :         {
   27655            0 :           if (GET_MODE (op2) != mode)
   27656            0 :             op2 = gen_lowpart (mode, op2);
   27657            0 :           emit_move_insn (target, op2);
   27658            0 :           return target;
   27659              :         }
   27660              :       break;
   27661              : 
   27662            0 :     case 0xc0:  /* a&b */
   27663            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27664            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27665            0 :           && (!op2 || !side_effects_p (op2)))
   27666            0 :         return ix86_expand_ternlog_binop (AND, mode, op0, op1, target);
   27667              :       break;
   27668              : 
   27669            0 :     case 0xcc:  /* b */
   27670            0 :       if ((!op0 || !side_effects_p (op0))
   27671            0 :           && op1
   27672            0 :           && (!op2 || !side_effects_p (op2)))
   27673              :         {
   27674            0 :           if (GET_MODE (op1) != mode)
   27675            0 :             op1 = gen_lowpart (mode, op1);
   27676            0 :           emit_move_insn (target, op1);
   27677            0 :           return target;
   27678              :         }
   27679              :       break;
   27680              : 
   27681            0 :     case 0xee:  /* b|c */
   27682            0 :       if ((!op0 || !side_effects_p (op0))
   27683            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27684            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode))
   27685            0 :         return ix86_expand_ternlog_binop (IOR, mode, op1, op2, target);
   27686              :       break;
   27687              : 
   27688            6 :     case 0xf0:  /* a */
   27689            6 :       if (op0
   27690            6 :           && (!op1 || !side_effects_p (op1))
   27691           12 :           && (!op2 || !side_effects_p (op2)))
   27692              :         {
   27693            6 :           if (GET_MODE (op0) != mode)
   27694            0 :             op0 = gen_lowpart (mode, op0);
   27695            6 :           emit_move_insn (target, op0);
   27696            6 :           return target;
   27697              :         }
   27698              :       break;
   27699              : 
   27700            0 :     case 0xfa:  /* a|c */
   27701            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27702            0 :           && op2 && ix86_ternlog_leaf_p (op2, mode)
   27703            0 :           && (!op1 || !side_effects_p (op1)))
   27704            0 :         return ix86_expand_ternlog_binop (IOR, mode, op0, op2, target);
   27705              :       break;
   27706              : 
   27707            0 :     case 0xfc:  /* a|b */
   27708            0 :       if (op0 && ix86_ternlog_leaf_p (op0, mode)
   27709            0 :           && op1 && ix86_ternlog_leaf_p (op1, mode)
   27710            0 :           && (!op2 || !side_effects_p (op2)))
   27711            0 :         return ix86_expand_ternlog_binop (IOR, mode, op0, op1, target);
   27712              :       break;
   27713              : 
   27714            0 :     case 0xff:
   27715            0 :       if ((!op0 || !side_effects_p (op0))
   27716            0 :           && (!op1 || !side_effects_p (op1))
   27717            0 :           && (!op2 || !side_effects_p (op2)))
   27718              :         {
   27719            0 :           emit_move_insn (target, CONSTM1_RTX (mode));
   27720            0 :           return target;
   27721              :         }
   27722              :       break;
   27723              :     }
   27724              : 
   27725         2330 :   if (!register_operand (op0, mode))
   27726              :     {
   27727              :       /* We can't use force_reg (mode, op0).  */
   27728           12 :       tmp0 = gen_reg_rtx (GET_MODE (op0));
   27729           12 :       emit_move_insn (tmp0,op0);
   27730              :     }
   27731              :   else
   27732              :     tmp0 = op0;
   27733         2330 :   if (GET_MODE (tmp0) != mode)
   27734            0 :     tmp0 = gen_lowpart (mode, tmp0);
   27735              : 
   27736         2330 :   if (!op1 || rtx_equal_p (op0, op1))
   27737            6 :     tmp1 = copy_rtx (tmp0);
   27738         2324 :   else if (!register_operand (op1, mode))
   27739              :     {
   27740              :       /* We can't use force_reg (mode, op1).  */
   27741           28 :       tmp1 = gen_reg_rtx (GET_MODE (op1));
   27742           28 :       emit_move_insn (tmp1, op1);
   27743              :     }
   27744              :   else
   27745              :     tmp1 = op1;
   27746         2330 :   if (GET_MODE (tmp1) != mode)
   27747            0 :     tmp1 = gen_lowpart (mode, tmp1);
   27748              : 
   27749         2330 :   if (!op2 || rtx_equal_p (op0, op2))
   27750           71 :     tmp2 = copy_rtx (tmp0);
   27751         2259 :   else if (rtx_equal_p (op1, op2))
   27752            0 :     tmp2 = copy_rtx (tmp1);
   27753         2259 :   else if (CONST_VECTOR_P (op2))
   27754              :     {
   27755           43 :       if (GET_MODE (op2) != mode)
   27756            0 :         op2 = gen_lowpart (mode, op2);
   27757           43 :       tmp2 = ix86_gen_bcst_mem (mode, op2);
   27758           43 :       if (!tmp2)
   27759              :         {
   27760           25 :           machine_mode bcst32_mode = mode;
   27761           25 :           machine_mode bcst64_mode = mode;
   27762           25 :           switch (mode)
   27763              :             {
   27764            1 :             case V1TImode:
   27765            1 :             case V4SImode:
   27766            1 :             case V4SFmode:
   27767            1 :             case V8HImode:
   27768            1 :             case V16QImode:
   27769            1 :               bcst32_mode = V4SImode;
   27770            1 :               bcst64_mode = V2DImode;
   27771            1 :               break;
   27772              : 
   27773            0 :             case V2TImode:
   27774            0 :             case V8SImode:
   27775            0 :             case V8SFmode:
   27776            0 :             case V16HImode:
   27777            0 :             case V32QImode:
   27778            0 :               bcst32_mode = V8SImode;
   27779            0 :               bcst64_mode = V4DImode;
   27780            0 :               break;
   27781              : 
   27782            3 :             case V4TImode:
   27783            3 :             case V16SImode:
   27784            3 :             case V16SFmode:
   27785            3 :             case V32HImode:
   27786            3 :             case V64QImode:
   27787            3 :               bcst32_mode = V16SImode;
   27788            3 :               bcst64_mode = V8DImode;
   27789            3 :               break;
   27790              : 
   27791              :             default:
   27792              :               break;
   27793              :             }
   27794              : 
   27795           25 :           if (bcst32_mode != mode)
   27796              :             {
   27797            4 :               tmp2 = gen_lowpart (bcst32_mode, op2);
   27798            4 :               if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
   27799              :                 {
   27800            3 :                   tmp2 = ix86_expand_ternlog (bcst32_mode,
   27801            3 :                                               gen_lowpart (bcst32_mode, tmp0),
   27802            3 :                                               gen_lowpart (bcst32_mode, tmp1),
   27803              :                                               tmp2, idx, NULL_RTX);
   27804            3 :                   emit_move_insn (target, gen_lowpart (mode, tmp2));
   27805            3 :                   return target;
   27806              :                 }
   27807              :             }
   27808              : 
   27809           22 :           if (bcst64_mode != mode)
   27810              :             {
   27811            1 :               tmp2 = gen_lowpart (bcst64_mode, op2);
   27812            1 :               if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
   27813              :                 {
   27814            0 :                   tmp2 = ix86_expand_ternlog (bcst64_mode,
   27815            0 :                                               gen_lowpart (bcst64_mode, tmp0),
   27816            0 :                                               gen_lowpart (bcst64_mode, tmp1),
   27817              :                                               tmp2, idx, NULL_RTX);
   27818            0 :                   emit_move_insn (target, gen_lowpart (mode, tmp2));
   27819            0 :                   return target;
   27820              :                 }
   27821              :             }
   27822              : 
   27823           22 :           tmp2 = force_const_mem (mode, op2);
   27824           22 :           rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
   27825           22 :           tmp2 = validize_mem (tmp2);
   27826           22 :           if (bcast)
   27827              :             {
   27828           12 :               rtx reg2 = gen_reg_rtx (mode);
   27829           12 :               bool ok = ix86_expand_vector_init_duplicate (false, mode,
   27830              :                                                            reg2, bcast);
   27831           12 :               if (ok)
   27832         2327 :                 tmp2 = reg2;
   27833              :             }
   27834              :         }
   27835              :     }
   27836              :   else
   27837              :     tmp2 = op2;
   27838         2327 :   if (GET_MODE (tmp2) != mode)
   27839            0 :     tmp2 = gen_lowpart (mode, tmp2);
   27840              :   /* Some memory_operands are not vector_memory_operands.  */
   27841         2327 :   if (!bcst_vector_operand (tmp2, mode))
   27842            0 :     tmp2 = force_reg (mode, tmp2);
   27843              : 
   27844         2327 :   rtvec vec = gen_rtvec (4, tmp0, tmp1, tmp2, GEN_INT (idx));
   27845         2327 :   emit_move_insn (target, gen_rtx_UNSPEC (mode, vec, UNSPEC_VTERNLOG));
   27846         2327 :   return target;
   27847              : }
   27848              : 
/* GF2P8AFFINEQB matrixes to implement shift and rotate.  */

/* 8x8 bit-matrix operands (one matrix packed into a uint64_t, one byte
   per matrix row) for GF2P8AFFINEQB implementing a per-byte left shift
   (ASHIFT) by the index amount 1..7.  Index 0 is unused.  */
static const uint64_t matrix_ashift[8] =
{
  0,
  0x0001020408102040, /* 1 l */
  0x0000010204081020, /* 2 l */
  0x0000000102040810, /* 3 l */
  0x0000000001020408, /* 4 l */
  0x0000000000010204, /* 5 l */
  0x0000000000000102, /* 6 l */
  0x0000000000000001  /* 7 l */
};
   27862              : 
/* GF2P8AFFINEQB matrices for a per-byte logical right shift (LSHIFTRT)
   by the index amount 1..7; vacated high bits are filled with zeros.
   Index 0 is unused.  */
static const uint64_t matrix_lshiftrt[8] =
{
  0,
  0x0204081020408000, /* 1 r */
  0x0408102040800000, /* 2 r */
  0x0810204080000000, /* 3 r */
  0x1020408000000000, /* 4 r */
  0x2040800000000000, /* 5 r */
  0x4080000000000000, /* 6 r */
  0x8000000000000000  /* 7 r */
};
   27874              : 
/* GF2P8AFFINEQB matrices for a per-byte arithmetic right shift (ASHIFTRT)
   by the index amount 1..7.  Compared to matrix_lshiftrt, the repeated
   0x80 rows replicate the sign bit into the vacated high positions.
   Index 0 is unused.  */
static const uint64_t matrix_ashiftrt[8] =
{
  0,
  0x0204081020408080, /* 1 r */
  0x0408102040808080, /* 2 r */
  0x0810204080808080, /* 3 r */
  0x1020408080808080, /* 4 r */
  0x2040808080808080, /* 5 r */
  0x4080808080808080, /* 6 r */
  0x8080808080808080  /* 7 r */
};
   27886              : 
/* GF2P8AFFINEQB matrices for a per-byte left rotate (ROTATE) by the
   index amount 1..7.  Index 0 is unused.  */
static const uint64_t matrix_rotate[8] =
{
  0,
  0x8001020408102040, /* 1 rol8 */
  0x4080010204081020, /* 2 rol8 */
  0x2040800102040810, /* 3 rol8 */
  0x1020408001020408, /* 4 rol8 */
  0x0810204080010204, /* 5 rol8 */
  0x0408102040800102, /* 6 rol8 */
  0x0204081020408001  /* 7 rol8 */
};
   27898              : 
/* GF2P8AFFINEQB matrices for a per-byte right rotate (ROTATERT) by the
   index amount 1..7; entry K equals matrix_rotate[8 - K], since a right
   rotate by K bits is a left rotate by 8 - K.  Index 0 is unused.  */
static const uint64_t matrix_rotatert[8] =
{
  0,
  0x0204081020408001, /* 1 ror8 */
  0x0408102040800102, /* 2 ror8 */
  0x0810204080010204, /* 3 ror8 */
  0x1020408001020408, /* 4 ror8 */
  0x2040800102040810, /* 5 ror8 */
  0x4080010204081020, /* 6 ror8 */
  0x8001020408102040  /* 7 ror8 */
};
   27910              : 
   27911              : /* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift
   27912              :    for CODE and shift count COUNT into register with vector of size of SRC.  */
   27913              : 
   27914              : rtx
   27915          189 : ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
   27916              : {
   27917          189 :   machine_mode mode = GET_MODE (src);
   27918          189 :   const uint64_t *matrix;
   27919          189 :   unsigned shift = INTVAL (count) & 7;
   27920          189 :   gcc_assert (shift > 0 && shift < 8);
   27921              : 
   27922          189 :   switch (code)
   27923              :     {
   27924              :     case ASHIFT:
   27925              :       matrix = matrix_ashift;
   27926              :       break;
   27927           26 :     case ASHIFTRT:
   27928           26 :       matrix = matrix_ashiftrt;
   27929           26 :       break;
   27930           28 :     case LSHIFTRT:
   27931           28 :       matrix = matrix_lshiftrt;
   27932           28 :       break;
   27933           32 :     case ROTATE:
   27934           32 :       matrix = matrix_rotate;
   27935           32 :       break;
   27936           33 :     case ROTATERT:
   27937           33 :       matrix = matrix_rotatert;
   27938           33 :       break;
   27939            0 :     default:
   27940            0 :       gcc_unreachable ();
   27941              :     }
   27942              : 
   27943          189 :   int nelts = GET_MODE_NUNITS (mode);
   27944          189 :   rtvec vec = rtvec_alloc (nelts);
   27945          189 :   uint64_t ma = matrix[shift];
   27946         7741 :   for (int i = 0; i < nelts; i++)
   27947         7552 :     RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
   27948              : 
   27949          189 :   return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
   27950              : }
   27951              : 
   27952              : /* Trunc a vector to a narrow vector, like v4di -> v4si.  */
   27953              : 
   27954              : void
   27955           63 : ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode)
   27956              : {
   27957           63 :   machine_mode out_mode = GET_MODE (output);
   27958           63 :   machine_mode in_mode = GET_MODE (input);
   27959           63 :   int len = GET_MODE_SIZE (in_mode);
   27960          252 :   gcc_assert (len == GET_MODE_SIZE (cvt_mode)
   27961              :               && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode)
   27962              :               && (REG_P (input) || SUBREG_P (input)));
   27963           63 :   scalar_mode inner_out_mode = GET_MODE_INNER (out_mode);
   27964          126 :   int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
   27965           63 :   int out_innersize = GET_MODE_SIZE (inner_out_mode);
   27966              : 
   27967           63 :   struct expand_vec_perm_d d;
   27968           63 :   d.target = gen_reg_rtx (cvt_mode);
   27969           63 :   d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode);
   27970           63 :   d.op1 = d.op0;
   27971           63 :   d.vmode = cvt_mode;
   27972           63 :   d.nelt = GET_MODE_NUNITS (cvt_mode);
   27973           63 :   d.testing_p = false;
   27974           63 :   d.one_operand_p = true;
   27975              : 
   27976              :   /* Init perm. Put the needed bits of input in order and
   27977              :      fill the rest of bits by default.  */
   27978          687 :   for (int i = 0; i < d.nelt; ++i)
   27979              :     {
   27980          624 :       d.perm[i] = i;
   27981         1248 :       if (i < GET_MODE_NUNITS (out_mode))
   27982          246 :         d.perm[i] = i * (in_innersize / out_innersize);
   27983              :     }
   27984              : 
   27985           63 :   bool ok = ix86_expand_vec_perm_const_1(&d);
   27986           63 :   gcc_assert (ok);
   27987           63 :   emit_move_insn (output, gen_lowpart (out_mode, d.target));
   27988           63 : }
   27989              : 
   27990              : /* Implement truncv8sfv8bf2 with vector permutation.  */
   27991              : void
   27992            8 : ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
   27993              : {
   27994            8 :   machine_mode vperm_mode, src_mode = GET_MODE (src);
   27995            8 :   switch (src_mode)
   27996              :     {
   27997              :     case V16SFmode:
   27998              :       vperm_mode = V32BFmode;
   27999              :       break;
   28000            2 :     case V8SFmode:
   28001            2 :       vperm_mode = V16BFmode;
   28002            2 :       break;
   28003            4 :     case V4SFmode:
   28004            4 :       vperm_mode = V8BFmode;
   28005            4 :       break;
   28006            0 :     default:
   28007            0 :       gcc_unreachable ();
   28008              :     }
   28009              : 
   28010            8 :   int nelt = GET_MODE_NUNITS (vperm_mode);
   28011            8 :   vec_perm_builder sel (nelt, nelt, 1);
   28012            8 :   sel.quick_grow (nelt);
   28013          136 :   for (int i = 0; i != nelt; i++)
   28014          128 :     sel[i] = (2 * i + 1) % nelt;
   28015           16 :   vec_perm_indices indices (sel, 1, nelt);
   28016              : 
   28017            8 :   rtx target = gen_reg_rtx (vperm_mode);
   28018            8 :   rtx op0 = lowpart_subreg (vperm_mode,
   28019              :                             force_reg (src_mode, src),
   28020              :                             src_mode);
   28021            8 :   bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
   28022              :                                               target, op0, op0, indices);
   28023            8 :   gcc_assert (ok);
   28024            8 :   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
   28025            8 : }
   28026              : 
   28027              : /* Implement extendv8bf2v8sf2 with vector permutation.  */
   28028              : void
   28029            8 : ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
   28030              : {
   28031            8 :   machine_mode vperm_mode, src_mode = GET_MODE (src);
   28032            8 :   switch (src_mode)
   28033              :     {
   28034              :     case V16BFmode:
   28035              :       vperm_mode = V32BFmode;
   28036              :       break;
   28037            2 :     case V8BFmode:
   28038            2 :       vperm_mode = V16BFmode;
   28039            2 :       break;
   28040            4 :     case V4BFmode:
   28041            4 :       vperm_mode = V8BFmode;
   28042            4 :       break;
   28043            0 :     default:
   28044            0 :       gcc_unreachable ();
   28045              :     }
   28046              : 
   28047            8 :   int nelt = GET_MODE_NUNITS (vperm_mode);
   28048            8 :   vec_perm_builder sel (nelt, nelt, 1);
   28049            8 :   sel.quick_grow (nelt);
   28050          136 :   for (int i = 0, k = 0, j = nelt; i != nelt; i++)
   28051          128 :     sel[i] = i & 1 ? j++ : k++;
   28052              : 
   28053           16 :   vec_perm_indices indices (sel, 2, nelt);
   28054              : 
   28055            8 :   rtx target = gen_reg_rtx (vperm_mode);
   28056            8 :   rtx op1 = lowpart_subreg (vperm_mode,
   28057              :                             force_reg (src_mode, src),
   28058              :                             src_mode);
   28059            8 :   rtx op0 = CONST0_RTX (vperm_mode);
   28060            8 :   bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
   28061              :                                               target, op0, op1, indices);
   28062            8 :   gcc_assert (ok);
   28063            8 :   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
   28064            8 : }
   28065              : 
   28066              : 
   28067              : #include "gt-i386-expand.h"
        

Generated by: LCOV version 2.4-beta

This LCOV profile was generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite was run with the built compiler.